def get_captcha_code(self, response):
    """Fetch the court-site captcha image and OCR it; return the code.

    Loops forever, re-downloading and re-recognizing, until the OCR
    result is exactly 4 characters long.

    :param response: the response whose headers are replayed for the fetch
    :return: the recognized 4-character captcha code (str)
    """
    headers = get_headers_from_response(response)
    # Reuse the crawler's download delay as the retry back-off interval.
    sleep_time = self.settings.get("DOWNLOAD_DELAY", 0.3)
    captcha_id = self.captcha_id
    _get_captcha_code = self._get_captcha_code
    while True:
        form_data = {
            "captchaId": captcha_id,
            "random": str(rand_0_1()),
        }
        try:
            captcha_body = get_content_by_requests(
                "http://zhixing.court.gov.cn/search/captcha.do?"
                + urlencode(form_data),
                headers=headers)
        except Exception:
            # Network hiccup: back off and retry.
            sleep(sleep_time)
            continue
        if captcha_body.startswith(b"<"):
            # Got an HTML page instead of image bytes: the site is
            # demanding JavaScript; wait and retry.
            self.logger.error("被执行人---验证码:请开启JavaScript并刷新该页")
            # BUG FIX: was sleep(self.sleep_time); the attribute is not set
            # anywhere visible and every other branch uses the local
            # sleep_time computed above — use it here too.
            sleep(sleep_time)
            continue
        captcha_code = _get_captcha_code(captcha_body)
        if len(captcha_code) == 4:
            return captcha_code
        sleep(sleep_time)
def _get_callDetail_image_captcha(self, response, username):
    """Download the call-detail image captcha for *username*.

    Pushes the current request headers into SSDB first (so the web side
    can replay them), then fetches and returns the captcha image bytes.
    """
    request_headers = get_headers_from_response(response)
    self.set_image_captcha_headers_to_ssdb(request_headers, username)
    jar = get_cookiejar_from_response(response)
    captcha_url = "http://shop.10086.cn/i/authImg?t=" + str(rand_0_1())
    return get_content_by_requests(captcha_url,
                                   headers=request_headers,
                                   cookie_jar=jar)
def _set_sms_captcha_headers_to_ssdb(self, username, response):
    """Extract the headers of *response* and store them in SSDB.

    :param username: key under which the headers are stored
    :param response: response whose headers are extracted
    :return: None
    """
    self.set_sms_captcha_headers_to_ssdb(
        get_headers_from_response(response), username)
def user_login(self, response):
    """Parse the Sina SSO login JSON and dispatch the next step.

    retcode '4049': remote login, an image captcha is required — ask the
    user for it via SSDB/django, then re-run the prelogin step.
    retcode '101'/'2070'/'2079': wrong username or password.
    Anything else: follow the cross-domain URL list to finish login.
    """
    meta = response.meta
    item = meta['item']
    username = item['username']
    try:
        login_js = json_loads(response.text)
        retcode = login_js['retcode']
        if retcode == '4049':
            # Send the image captcha to the user.
            headers = get_headers_from_response(response)
            captcha_url = meta['captcha_url']
            self.set_image_captcha_headers_to_ssdb(
                headers, username)  # hand the header info over to django
            self.set_email_img_url_to_ssdb(captcha_url, username)
            captcha_body = get_content_by_requests(captcha_url,
                                                   headers=headers)
            captcha_code = self.ask_image_captcha(captcha_body,
                                                  username,
                                                  file_type=".png")
            # NOTE(review): key is spelled "captecha_code" — kept as-is;
            # presumably the downstream reader uses the same spelling.
            meta["captecha_code"] = {
                'door': captcha_code,
                'pcid': meta['pcid']
            }
            # Remote login: captcha verification is required.
            su = self._enb64(self._url_encode(username)).decode()
            step1_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=sso' \
                '&callback=sinaSSOController.preloginCallBack' \
                '&su=%s&rsakt=mod&client=ssologin.js(v1.4.19)' % su
            yield Request(url=step1_url,
                          callback=self.step1,
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        elif retcode in ['101', '2070', '2079']:
            err_message = '登录名或密码错误!'
            yield from self.error_handle(
                username,
                msg="sina---登录失败:(username:%s, password:%s) %s" %
                (username, item['password'], err_message),
                tell_msg=err_message)
        else:
            meta["cross"] = login_js['crossDomainUrlList'][0]
            yield Request(url=login_js['crossDomainUrlList'][1],
                          callback=self.cross_domain_one,
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
    except Exception:
        yield from self.except_handle(
            username,
            msg="用户登录解析异常",
            tell_msg="邮箱登录失败,请刷新重试",
            logout_request=self.get_logout_request(meta))
def parse_search(self, response):
    """Walk the Sina-mail search results, collecting credit-card bills.

    Accumulates (url, bankname, subject) tuples in meta['mail_list']
    across pages; on the last page either reports "done" (no matches)
    or yields one detail request per matched mail.
    """
    meta = response.meta
    item = meta['item']
    try:
        flag = meta['flag']
        mail_list = meta.setdefault("mail_list", [])
        the_data = json_loads(response.text)['data']
        for xr in the_data['maillist']:
            # Build the read-mail URL from the mail id (xr[0]).
            url = 'http://m%s.mail.sina.com.cn/classic/readmail.php' \
                '?webmail=1&fid=new&mid=%s&ts=17428' % (flag, xr[0])
            address = parseaddr(xr[1])[1]
            subject = xr[3]
            bankname = check_email_credit_card_by_address(subject, address)
            if bankname:
                mail_list.append((url, bankname, subject))
        next_page = the_data['currentpage'] + 1
        if the_data['pagenum'] >= next_page:
            # More pages remain: turn the page.
            data_form = meta['data_form']
            data_form['pageno'] = str(next_page)
            yield FormRequest(url=meta['search_url'],
                              formdata=data_form,
                              callback=self.parse_search,
                              meta=meta,
                              dont_filter=True,
                              errback=self.err_callback)
        else:
            if not mail_list:
                # Nothing matched across all pages: finish and log out.
                yield from self.crawling_done(
                    item, logout_request=self.get_logout_request(meta))
                return
            headers = get_headers_from_response(response)
            parse_detail = self.parse_detail
            err_callback = self.err_callback
            count = len(mail_list)
            for url, bankname, subject in mail_list:
                yield Request(url=url,
                              headers=headers,
                              callback=parse_detail,
                              meta={
                                  'item': item,
                                  'bankname': bankname,
                                  'subject': subject,
                                  'count': count
                              },
                              dont_filter=True,
                              errback=err_callback)
    except Exception:
        yield from self.except_handle(
            item['username'],
            msg="查找账单异常",
            tell_msg="查找账单异常",
            logout_request=self.get_logout_request(meta))
def _set_sms_captcha_headers_to_ssdb(self, username, response, cookies_dict):
    """Merge *cookies_dict* into the response headers' cookie, save to SSDB.

    Appends to any existing "cookie" header value; otherwise sets it.
    """
    headers = get_headers_from_response(response)
    extra = ";".join("%s=%s" % pair for pair in cookies_dict.items())
    existing = headers.get("cookie", "")
    headers["cookie"] = existing + ";" + extra if existing else extra
    self.set_sms_captcha_headers_to_ssdb(headers, username)
def _verify_callDetail_captcha(self, response, username, captcha_code):
    """Pre-check *captcha_code* against the 10086 shop; return True if OK."""
    jar = get_cookiejar_from_response(response)
    hdrs = get_headers_from_response(response)
    check_url = ("http://shop.10086.cn/i/v1/res/precheck/" + username
                 + "?captchaVal=" + captcha_code
                 + "&_=" + get_js_time())
    body = get_content_by_requests(check_url, headers=hdrs, cookie_jar=jar)
    return b'"retCode":"000000"' in body
def get_captcha(self, response, username="", cookie_dict=None):
    """Fetch the Unicom image captcha.

    :return: (image_bytes, cookies_dict) from the captcha response
    """
    hdrs = get_headers_from_response(response)
    captcha_url = ("http://uac.10010.com/portal/Service/CreateImage?t="
                   + get_js_time())
    rsp = get_response_by_requests(captcha_url,
                                   headers=hdrs,
                                   cookie_jar=cookie_dict)
    return rsp.content, rsp.cookies.get_dict()
def verify_captcha(self, response, captcha_code):
    """Pre-check a 10086 captcha without submitting the login form.

    :return: True when the site reports the code as correct
    """
    hdrs = get_headers_from_response(response)
    jar = get_cookiejar_from_response(response)
    # Non-ASCII characters become bare hex code points: unicode-escape
    # yields b"\\uXXXX", then the b"\\u" prefix is stripped.
    encoded_code = captcha_code.encode('unicode-escape',
                                       "ignore").replace(b"\\u", b"")
    check_url = (b"https://login.10086.cn/verifyCaptcha?inputCode="
                 + encoded_code)
    body = get_content_by_requests(check_url, headers=hdrs, cookie_jar=jar)
    return b'"resultCode":"0"' in body
def check_need_sms_captcha(self, response, username):
    """Ask 10086 whether this login requires an SMS verification code."""
    hdrs = get_headers_from_response(response)
    jar = get_cookiejar_from_response(response)
    params = urlencode({
        "accountType": "01",
        "account": username,
        "timestamp": get_js_time(),
    })
    body = get_content_by_requests(
        "https://login.10086.cn/needVerifyCode.htm?" + params,
        headers=hdrs,
        cookie_jar=jar)
    return b'"needVerifyCode":"1"' in body
def get_unisecid_request(self, response):
    """Hit the Unicom QR endpoint and parse its Set-Cookie header.

    :return: dict with 'name'/'value' for the first cookie pair, plus one
        entry per remaining cookie attribute (e.g. ' Path' -> '/'); an
        empty dict when no Set-Cookie header was returned.
    """
    cookiejar = get_cookiejar_from_response(response)
    headers = get_headers_from_response(response)
    the_time = get_js_time()
    url = "https://uac.10010.com/oauth2/genqr?" + the_time
    r = get_response_by_requests(url, headers=headers, cookie_jar=cookiejar)
    cookie = r.headers.get('Set-Cookie')
    result = {}
    if not cookie:
        # BUG FIX: header may be absent; previously None.split crashed.
        return result
    k_v_list = cookie.split(';')
    # BUG FIX: split('=') raised ValueError when the cookie value itself
    # contained '='; limit to one split.
    name, value = k_v_list[0].split('=', 1)
    result['name'] = name
    result['value'] = value
    for k_v in islice(k_v_list, 1, None):
        # BUG FIX: valueless attributes such as "HttpOnly" have no '=';
        # partition never raises, unlike the old split('=', 1) unpacking.
        k, sep, v = k_v.partition('=')
        result[k] = v if sep else ""
    return result
def parse(self, response):
    """Log in to the PBC credit-report site (ipcrs.pbccrc.org.cn).

    Fetches and OCRs the image captcha, scrapes the Struts token and
    date fields from the login page, then posts the login form.
    """
    meta = response.meta
    item = meta["item"]
    username = item["username"]
    password = item["password"]
    try:
        cookiejar = get_cookiejar_from_response(response)
        headers = get_headers_from_response(response)
        headers['Referer'] = response.url
        url = "https://ipcrs.pbccrc.org.cn/imgrc.do?" + get_js_time()
        captcha_body = get_content_by_requests_post(url,
                                                    headers=headers,
                                                    cookie_jar=cookiejar)
        captcha_code = self.parse_capatcha(captcha_body)
        self.logger.info("验证码识别结果:%s" % captcha_code)
        # Struts anti-CSRF token and server date, echoed back in the form.
        token = response.xpath(
            "//input[@name='org.apache.struts.taglib.html.TOKEN']/@value"
        ).extract_first("")
        date = response.xpath(
            "//input[@name='date']/@value").extract_first("")
        datas = {
            "org.apache.struts.taglib.html.TOKEN": token,
            "method": "login",
            "date": date,
            "loginname": username,
            "password": password,
            "_@IMGRC@_": captcha_code
        }
        # NOTE(review): posts with self.headers (class-level), not the local
        # headers used for the captcha fetch — confirm this is intended.
        yield FormRequest("https://ipcrs.pbccrc.org.cn/login.do",
                          headers=self.headers,
                          formdata=datas,
                          callback=self.parse_login,
                          meta=meta,
                          dont_filter=True)
    except Exception:
        yield from self.except_handle(username,
                                      msg="人行征信---登录入口解析失败",
                                      tell_msg="个人信息报告数据爬取失败,请刷新页面重试!")
def request_sms_code(self, response, username):
    """Ask 10086 to send the login SMS code.

    :return: True when the site replies with b'0' (code sent)
    """
    jar = get_cookiejar_from_response(response)
    hdrs = get_headers_from_response(response)
    payload = {
        "userName": username,
        "type": "01",
        "channelID": "12003",
    }
    reply = get_content_by_requests_post(
        "https://login.10086.cn/sendRandomCodeAction.action",
        data=payload,
        headers=hdrs,
        cookie_jar=jar)
    return reply == b'0'
def get_need_captcha_response(self, response, username, pwd_type="02"):
    """Ask Unicom whether this login needs an image captcha; return the response."""
    jar = get_cookiejar_from_response(response)
    hdrs = get_headers_from_response(response)
    now = get_js_time()
    # jQuery-style JSONP callback with a random suffix, as the site expects.
    query = urlencode({
        'userName': username,
        'pwdType': pwd_type,
        '_': int(now) + 1,
        'callback': "jQuery1720" + str(randint(1E16, 1E17 - 1)) + "_" + now,
    })
    return get_response_by_requests(
        "http://uac.10010.com/portal/Service/CheckNeedVerify?" + query,
        headers=hdrs,
        cookie_jar=jar)
def verify_captcha(self, response, captcha_code, cookies_dict):
    """Pre-check a Unicom captcha without submitting the login form.

    :return: True when the site reports the code as correct
    """
    hdrs = get_headers_from_response(response)
    now = get_js_time()
    query = urlencode({
        'verifyCode': captcha_code,
        'verifyType': "1",
        '_': int(now) + 1,
        'callback': "jQuery1720" + str(randint(1E16, 1E17 - 1)) + "_" + now,
    })
    body = get_content_by_requests(
        "https://uac.10010.com/portal/Service/CtaIdyChk?" + query,
        headers=hdrs,
        cookie_jar=cookies_dict)
    return b'"resultCode":"true"' in body
def get_ckPwd_request(self, response, username, cookie_dict=None):
    """Trigger the Unicom SendCkMSG SMS for *username*.

    :return: (ok, response_text) — ok is True when resultCode is "0000"
    """
    meta = response.meta
    # Respect the minimum interval between two SMS sends.
    sleep(self._get_sms_send_sleep_time(meta))
    now = get_js_time()
    query = urlencode({
        'mobile': username,
        'req_time': now,
        '_': int(now) + 1,
        'callback': "jQuery1720" + str(randint(1E16, 1E17 - 1)) + "_" + now,
    })
    hdrs = get_headers_from_response(response)
    rsp = get_response_by_requests(
        "https://uac.10010.com/portal/Service/SendCkMSG?" + query,
        headers=hdrs,
        cookie_jar=cookie_dict)
    meta["last_sms_time"] = time()
    body = rsp.text
    return ('resultCode:"0000"' in body), body
def parse_search(self, response):
    """Walk the mailbox search-result pages, collecting credit-card bills.

    Scrapes sender/subject from each result table, pages through using
    the page count parsed from an inline pager script, then yields one
    detail request per matched mail (or reports "done" on no matches).
    """
    meta = response.meta
    item = meta["item"]
    try:
        sid = meta['sid']
        mail_list = meta.setdefault('mail_list', [])
        detail_url = self.detail_url
        for email_table in response.xpath('//div[@class="toarea"]/table'):
            address = email_table.xpath(".//span[@e]/@e").extract_first()
            subject = email_table.xpath(
                './/td[contains(@class,"gt")]'
                '/div/u/text()').extract_first("").strip()
            bankname = check_email_credit_card_by_address(subject, address)
            if bankname:
                mail_id = email_table.xpath(
                    './/td/nobr/@mailid').extract_first()
                tmp_detail_url = detail_url.format(mailid=mail_id, sid=sid)
                mail_list.append((tmp_detail_url, bankname, subject))
        page_num = meta.get('page_num')
        if page_num is None:
            # First page: dig the total page count out of the pager script.
            nextpage_script = response.xpath(
                '//div[@class="right"]/script/text()').extract_first("")
            page_num = self.page_num_pattern.search(nextpage_script)
            if page_num:
                page_num = int(page_num.group(1))
            else:
                # Fallback: search the whole page body.
                page_num = int(
                    self.page_num_pattern.search(response.text).group(1))
            meta["page_num"] = page_num
        next_page = meta.get('current_page', 0) + 1
        if page_num >= next_page:
            meta["current_page"] = next_page
            search_url = self.search_url.format(sid, next_page, self.keyword)
            yield Request(search_url,
                          meta=meta,
                          dont_filter=True,
                          callback=self.parse_search,
                          errback=self.err_callback)
        else:
            if not mail_list:
                yield from self.crawling_done(item)
                return
            headers = get_headers_from_response(response)
            parse_detail = self.parse_detail
            err_callback = self.err_callback
            count = len(mail_list)
            for url, bankname, subject in mail_list:
                yield Request(url,
                              headers=headers,
                              dont_filter=True,
                              callback=parse_detail,
                              errback=err_callback,
                              meta={
                                  'bankname': bankname,
                                  'item': item,
                                  'subject': subject,
                                  'count': count
                              })
    except Exception:
        yield item
        # NOTE(review): sibling spiders read meta['item']['username'] here;
        # this one reads meta['username'] — confirm that key is set in
        # this spider's meta.
        yield from self.except_handle(response.meta['username'],
                                      msg="查找账单异常",
                                      tell_msg="查找账单异常")
def get_login_request(self, response, username, password=None, pwd_type="02"):
    """Build the Unicom (10010) login request.

    :param pwd_type: "02" = SMS verification code, "01" = service password
    """
    meta = response.meta
    the_time = get_js_time()
    form_data = {
        'userName': username,
        'password': password,
        'pwdType': pwd_type,
        'productType': "01",
        'redirectType': "01",
        'rememberMe': "0",
        'req_time': the_time,
        'redirectURL': "https://uac.10010.com/cust/userinfo/userInfoInit",
        '_': int(the_time) + 1,
        'callback':
        "jQuery1720" + str(randint(1E16, 1E17 - 1)) + "_" + the_time
    }
    try:
        # Decide whether an SMS / CK verification code must be collected.
        if "02" == pwd_type and password is None:
            # SMS login without a code yet: ask the user for it via SSDB.
            headers = get_headers_from_response(response)
            self.set_sms_captcha_headers_to_ssdb(headers, username)
            sms_uid = self.need_sms_captcha_type(username, type="general")
            sms_password = self.ask_captcha_code(sms_uid)
            form_data['password'] = sms_password
            meta["sms_password"] = sms_password
        elif "01" == pwd_type:
            # Service-password login may additionally require a CK SMS code.
            need_ck, cookies_dict = self.cheek_need_ck_captcha(
                response, username)
            meta["cookies_dict"] = cookies_dict
            if need_ck:
                self._set_sms_captcha_headers_to_ssdb(
                    username, response, cookies_dict)
                ck_uid = self.need_sms_captcha_type(username, type="login")
                ck_password = self.ask_captcha_code(ck_uid)
                form_data['verifyCKCode'] = ck_password
                meta["ck_password"] = ck_password
    except CaptchaTimeout:
        # The user never supplied the code in time.
        yield from self.error_handle(username,
                                     "联通---发送登录请求,等待验证码超时",
                                     tell_msg="等待验证码超时,请刷新页面重试。。")
    else:
        meta.setdefault('cookies_dict', {})
        login_url = "https://uac.10010.com/portal/Service/MallLogin?" + urlencode(
            form_data)
        meta["login_url"] = login_url
        # Warm up the session via the home page first; the actual login
        # URL is carried along in meta["login_url"].
        yield Request("https://uac.10010.com/portal/homeLogin",
                      self.parse_homeLogin,
                      dont_filter=True,
                      meta=meta,
                      errback=self.err_callback)
def parse_search(self, response):
    """Walk the 163-mail XML search results, collecting credit-card bills.

    Pages through via mbox_pager_next while a full page of matches keeps
    arriving, then yields one detail request per matched mail (or reports
    "done" when nothing matched).
    """
    meta = response.meta
    item = meta["item"]
    try:
        host = meta['host']
        mail_list = meta.setdefault("mail_list", [])
        ids = response.xpath('//string[@name="id"]/text()').extract()
        address_list = (parseaddr(address)[1] for address in response.xpath(
            '//string[@name="from"]/text()').extract())
        # Template for the final detail-page URL.
        text_url = host + 'js6/read/readhtml.jsp?mid=%s&font=15&color=064977'
        for i, (subject, address) in enumerate(
                zip(
                    response.xpath(
                        '//string[@name="subject"]/text()').extract(),
                    address_list)):
            bankname = check_email_credit_card_by_address(subject, address)
            if bankname:
                mail_list.append((text_url % ids[i], bankname, subject))
        other_mail_ids_list = self.other_mail_ids_list_pattern.findall(
            response.text)
        if other_mail_ids_list and len(mail_list) == self.PAGE_PER_COUNT:
            # A full page of matches and more ids remain: fetch next page.
            page_num = meta.setdefault("page_num", 1)
            search_url = host + 'js6/s?sid={0}&func=mbox:getMessageInfos' \
                '&mbox_pager_next={1}'.format(meta['sid'], page_num)
            meta['page_num'] += 1
            other_mail_ids_str = ''.join(other_mail_ids_list)
            data = {
                'var':
                '<?xml version="1.0"?><object><array name="ids">{0}</array>'
                '<int name="windowSize">{1}</int><boolean name="returnTag">true'
                '</boolean></object>'.format(other_mail_ids_str,
                                             self.PAGE_PER_COUNT)
            }
            yield FormRequest(url=search_url,
                              callback=self.parse_search,
                              formdata=data,
                              meta=meta,
                              dont_filter=True,
                              errback=self.err_callback)
        else:
            if not mail_list:
                yield from self.crawling_done(meta['item'])
                return
            headers = get_headers_from_response(response)
            parse_detail = self.parse_detail
            err_callback = self.err_callback
            count = len(mail_list)
            for url, bankname, subject in mail_list:
                yield Request(url,
                              headers=headers,
                              dont_filter=True,
                              callback=parse_detail,
                              errback=err_callback,
                              meta={
                                  'bankname': bankname,
                                  'item': item,
                                  'subject': subject,
                                  'count': count
                              })
    except Exception:
        yield from self.except_handle(meta["item"]["username"],
                                      msg="查找账单异常",
                                      tell_msg="查找账单异常")
def parse_search(self, response):
    """Walk the Sohu-mail JSON search results, collecting credit-card bills.

    Pages through using total/page_step, then yields one detail request
    per matched mail (or reports "done" when nothing matched).
    """
    text = response.text
    meta = response.meta
    item = meta['item']
    try:
        if not text:
            # Empty body: the search produced no result at all.
            yield item
            yield from self.error_handle(item["username"],
                                         "搜狐邮箱---搜索账单没结果",
                                         tell_msg="未找到账单")
            return
        mail_list = meta.setdefault("mail_list", [])
        the_data = json_loads(response.text)['data']
        for it in the_data['list']:
            subject = it['subject']
            bankname = check_email_credit_card_by_address(
                subject, it["from"])
            if bankname:
                detail_url = 'https://mail.sohu.com/fe/getMail?id=%s&t=%s' % (
                    it['id'], get_js_time())
                mail_list.append((detail_url, bankname, subject))
        page_step = self.page_step
        page_num = meta.get('page_num')
        if page_num is None:
            # Total page count, rounded up.
            page_num = (the_data['total'] + page_step - 1) // page_step
            meta['page_num'] = page_num
        next_page = meta.get('current_page', 0) + 1
        # NOTE(review): uses strict '>' (siblings use '>='); presumably the
        # Sohu search URL takes a 0-based item offset (next_page * page_step)
        # rather than a page number — confirm before "fixing".
        if page_num > next_page:
            meta["current_page"] = next_page
            search_url = self.search_url % (next_page * page_step,
                                            get_js_time(), self.keyword)
            yield Request(url=search_url,
                          callback=self.parse_search,
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        else:
            if not mail_list:
                yield from self.crawling_done(
                    item, logout_request=self.get_logout_request(meta))
                return
            headers = get_headers_from_response(response)
            parse_detail = self.parse_detail
            err_callback = self.err_callback
            count = len(mail_list)
            for url, bankname, subject in mail_list:
                yield Request(url=url,
                              callback=parse_detail,
                              headers=headers,
                              meta={
                                  'item': item,
                                  'bankname': bankname,
                                  'subject': subject,
                                  'count': count,
                                  'cookies_dict': meta["cookies_dict"],
                                  'ppmdig_cookies': meta["ppmdig_cookies"],
                              },
                              dont_filter=True,
                              errback=err_callback)
    except Exception:
        yield from self.except_handle(
            item['username'],
            "查找账单异常",
            tell_msg="查找账单异常",
            logout_request=self.get_logout_request(meta))