def parse_person_info_three(self, response):
    """Parse the third personal-info response and request this month's bill.

    The body is decoded through ``self.dx_conver`` and re-serialized so the
    result code can be matched as text.  On success the real name and the
    account status are stored on the item and the ``queryThisMonthBill``
    request is yielded.  On a non-success result code the item is yielded
    unchanged and the failure is reported through ``error_handle``.

    Note from the original author: the encrypted request parameters produced
    here differ from the ones the app sends, and no difference could be found
    by comparison; the encrypt/decrypt helpers are suspected to affect how the
    parameters are displayed.

    :param response: response whose ``meta`` carries ``item`` and ``login_token``
    :return: generator yielding the next request, or the item plus error output
    """
    meta = response.meta
    item = meta["item"]
    username = item["username"]
    text = ""
    try:
        decoded = self.dx_conver.convert_response_data(response.text)
        text = json_dumps(decoded, ensure_ascii=False)

        # Failure path first: hand back what we have and report the error.
        if '"ResultCode":{"value":"0000"}' not in text:
            yield item
            tell_msg = self.get_err_msg(text, True)
            yield from self.error_handle(
                username,
                "电信---获取个人信息3失败:(username:%s, password:%s) %s"
                % (username, item["password"], text),
                tell_msg=tell_msg)
            return

        self.logger.info("[电信-" + username + "]: 获取个人信息3成功!")
        datas = decoded["Response"]["ResponseData"]["Data"]
        item["real_name"] = datas["Cust_Name"]["value"]
        # NOTE(review): Cust_Name is read through ["value"] but NumberStatus
        # is compared directly -- confirm the shape of NumberStatus.
        if datas["NumberStatus"] == "100000":
            item["status"] = UserStatus.Opened
        else:
            item["status"] = UserStatus.Shutdown

        field_data = {
            "accnbr": username,
            "queryflag": "0",
            "queryType": "0",
        }
        form_data = {
            "content": {
                "fieldData": field_data,
                "attach": "test"
            },
            "headerInfos": {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryThisMonthBill",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
            }
        }
        yield Request("https://appgo.189.cn:8443/query/bill/queryThisMonthBill",
                      self.parse_this_month_bill, "POST", self.appgo_headers,
                      json_dumps(form_data), meta=meta, dont_filter=True,
                      errback=self.err_callback)
    except Exception:
        yield item
        yield from self.except_handle(username, "电信---解析个人信息3失败: %s" % text)
def exception_handle(self, condition, error_info, retries_left=3):
    """Re-queue a failed condition, switch proxy and retry the list request.

    This is a generator; callers have to iterate it (``yield from``) for
    anything to happen.  The steps are: push the failed condition back to the
    query queue (skipped for the spider named ``condition_spider``), pause
    five seconds, fetch a new proxy and yield a fresh POST to
    ``self.list_url``.

    If any of those steps raises, the whole procedure is retried.  The
    original code called itself without iterating the returned generator, so
    the retry never actually ran; it is now delegated with ``yield from`` and
    bounded by ``retries_left`` so a persistent failure cannot recurse
    forever.

    :param condition: query condition that was being processed
    :param error_info: short description of what failed, only logged
    :param retries_left: how many more times a failing recovery is retried
    :return: generator yielding the retry request (possibly nothing)
    """
    try:
        if self.name != "condition_spider":
            # Whatever went wrong, put the failed condition back in the queue.
            self.push_wenshu_condition(condition)
        self.logger.info("parse or parse_doc error->%s" % str(error_info))

        # Back off briefly before retrying (the server may be throttling us).
        self.logger.info("sleep start!")
        sleep(5)
        self.logger.info("sleep end!")

        # Switch to another proxy.
        self.proxy = self.proxy_api.get_proxy_one()
        self.logger.error("request retry")

        # Re-issue the list request for the current condition.
        request = Request(url=self.list_url,
                          method='POST',
                          callback=self.parse,
                          body=json_dumps(self.req_data),
                          headers=self.headers,
                          dont_filter=True,
                          errback=self.err_callback)
        self.set_proxy(request)
        yield request
    except Exception:
        if retries_left > 0:
            yield from self.exception_handle(condition, "change proxy error!",
                                             retries_left - 1)
        else:
            self.logger.exception("exception_handle gave up after repeated failures")
def push_query_condition_queue(self, condition):
    """Serialize a query condition and append it to the SSDB condition queue.

    The condition is normalized with ``self.dict_sorted`` before it is dumped
    to JSON (non-ASCII characters are kept as-is).  Pushing stays best-effort:
    a failure does not propagate to the caller, but it is now logged instead
    of being dropped silently.

    :param condition: query condition mapping
    :return: None
    """
    try:
        serialized = json_dumps(self.dict_sorted(condition), ensure_ascii=False)
        self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, serialized)
    except Exception:
        self.logger.exception("push query condition to queue failed: %s", condition)
def _set_sms_captcha_headers_to_ssdb(self, username, token): """ 将当前的token放入ssdb中 :param username: 用户名 :param token : 用户唯一凭证 :return : None """ json_str = json_dumps({"token": token}) self.set_sms_captcha_headers_to_ssdb(json_str, username)
def ask_qrcode_status(request):
    """Poll Taobao for the scan status of a login QR code.

    Reads ``username``, ``account_type`` and the optional ``lg_token`` from
    ``request.POST`` and queries the Taobao QR login check endpoint.  When the
    login is confirmed (code ``10006``) the redirect URL is followed and the
    resulting ``.taobao.com`` cookies are stored in SSDB for the crawler.

    Changes over the previous version:
    * ``lg_token`` comes from the client and is now URL-encoded before it is
      placed in the query string, so it cannot inject extra parameters.
    * the HTTP session is only created in the branch that uses it.

    :param request: request object exposing the submitted form as ``POST``
    :return: ``JsonResponse``; error responses also carry ``need_refresh``
    """
    from urllib.parse import quote

    ret_data = {}
    succ = False
    need_refresh = False
    try:
        args = request.POST
        username = args["username"]
        account_type = args["account_type"]
        lg_token = args.get("lg_token", "")

        check_url_base = "https://qrlogin.taobao.com/qrcodelogin/qrcodeLoginCheck.do?" \
                         "lgToken={lgToken}&defaulturl=https%3A%2F%2Fwww.taobao.com%2F"
        check_url = check_url_base.format(lgToken=quote(lg_token, safe=""))
        res_json = get_response_by_requests(check_url, headers=DEFAULT_HEADERS).json()

        msg = "通过扫描二维码登录失败"
        code = res_json.get("code")
        if code == "10000":
            # Not scanned yet.
            msg = "请先扫描二维码"
        elif code == "10001":
            # Scanned, waiting for the user to confirm on the phone.
            msg = "扫描成功后,请确认登录"
            succ = True
        elif code == "10004":
            # QR code expired: the client has to fetch a new one.
            msg = "二维码已失效,请重试"
            need_refresh = True
        elif code == "10006":
            # Login confirmed: follow the redirect to collect the cookies.
            redirect_url = res_json.get("url")
            session = req_session()
            resp = session.get(redirect_url, headers=DEFAULT_HEADERS, verify=False)
            if resp.status_code == 200:
                msg = "登录成功"
                cookies = session.cookies.get_dict(domain='.taobao.com')
                cookies_str = json_dumps(cookies)
                # Store the cookies of the successful login in SSDB for the crawler.
                ssdb_connect = get_ssdb_conn()
                key = username + ACCOUNT_CRAWLING_QRCODE_COOKIES_SSDB_SUFFIX + account_type
                ssdb_connect.setx(key, cookies_str, DATA_EXPIRE_TIME)
                succ = True
        else:
            msg = res_json.get("msg", "通过扫描二维码登录失败")
    except Exception:
        msg = "获取扫描二维码状态出错"

    if succ:
        add_ajax_ok_json(ret_data)
    else:
        ret_data["need_refresh"] = need_refresh
        add_ajax_error_json(ret_data, msg)
    return JsonResponse(ret_data)
def _send_taobao_alicloudapi(url, b64_pic):
    """Post a base64 image to an Alicloud OCR endpoint and return the words.

    An APPCODE is picked at random from ``ALICLOUDAPI_APPCODE_LIST`` for the
    ``Authorization`` header.

    :param url: endpoint URL
    :param b64_pic: image content, sent as the ``img`` field
    :return: list of the ``word`` values found under ``prism_wordsInfo``, or
             ``None`` when the response has no such key
    """
    payload = {'img': b64_pic,
               'prob': 'false'
               }
    auth_headers = {'Authorization': 'APPCODE ' + choice(ALICLOUDAPI_APPCODE_LIST),
                    }
    response = http_post(url, data=json_dumps(payload), headers=auth_headers)
    parsed = json_loads(response.text)
    if "prism_wordsInfo" not in parsed:
        return None
    return [entry["word"] for entry in parsed["prism_wordsInfo"]]
def record_query_condition(self, condition, status=0):
    """Insert a query condition and its crawl status into MongoDB.

    The condition is normalized with ``self.dict_sorted`` and stored as a JSON
    string next to ``status``.  Recording stays best-effort: a failure does
    not propagate, but it is now logged instead of being dropped silently.

    :param condition: query condition mapping
    :param status: status value stored with the condition (default ``0``)
    :return: None
    """
    try:
        record = {
            "condition": json_dumps(self.dict_sorted(condition), ensure_ascii=False),
            "status": status,
        }
        self.mongo_instance.insertOne(record)
    except Exception:
        self.logger.exception("record query condition failed: %s", condition)
def _dangerous_verify_scrapy(self, username, verify_url, response): """ 登录安全校验(先发送短信验证码,再提交校验) :param username: :param verify_url: :param response: :return: """ try: headers_data = json_dumps({"url": verify_url}) self.set_sms_captcha_headers_to_ssdb(headers=headers_data, username=username) self.logger.info("等待获取用户输入短信验证码中...") sms_code_data = self.ask_sms_captcha(username) if not sms_code_data: msg = "获取用户输入短信验证码失败,登录失败" yield from self.error_handle(username, msg) else: tmp_data = sms_code_data.split("_") sms_code, ret_key = tmp_data if len(tmp_data) == 2 else ("", "") self.logger.info("%s ---> sms_code:%s" % (username, sms_code)) # 获取用户指纹信息eid,fp暂未实现,获取方式:https://payrisk.jd.com/js/td.js eid, fp = ("", "") valid_url = "https://safe.jd.com/dangerousVerify/checkDownLinkCode.action" \ "?code={code}&k={k}&t={stime}&eid={eid}" \ "&fp={fp}".format(code=sms_code, k=ret_key, stime=get_js_time(), eid=eid, fp=fp) my_headers = self.headers.copy() my_headers.update({ "Accept": "application/json, text/javascript, */*; q=0.01", "Referer": verify_url, "Host": "safe.jd.com", "Connection": "keep-alive", "X-Requested-With": "XMLHttpRequest", "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" }) yield Request(url=valid_url, headers=my_headers, callback=self._parse_verify_result, meta=response.meta, dont_filter=True, errback=self.err_callback) except CaptchaTimeout: yield from self.error_handle(username, "获取短信验证码超时,登录失败") except Exception: msg = "进行安全校验出错:%s" % username yield from self.except_handle(username, msg)
def rabbitmq_sender(self, queue, item_dict):
    """Compress, base64-encode and publish an item to a RabbitMQ queue.

    The item is dumped to JSON, encoded as UTF-8, passed through ``compress``
    (gzip compression according to the original note) and base64-encoded
    before it is sent.

    The ``_id`` key is removed from ``item_dict`` in place, as before.  It is
    now dropped with ``pop`` so an item that carries no ``_id`` no longer
    raises ``KeyError``.

    :param queue: queue name
    :param item_dict: item mapping to publish; ``_id`` is removed from it
    :return: None
    """
    item_dict.pop("_id", None)
    raw = json_dumps(item_dict).encode("utf-8")
    content = b64encode(compress(raw)).decode("utf-8")
    with RabbitmqSender(queue=queue,
                        exchange=RABBITMQ_EXCHANGE,
                        durable=True) as rs:
        rs.send(content)
def _get_telecom_bills_sms_captcha(args):
    """Ask China Telecom to send a regular SMS verification code.

    The login token previously stored in SSDB for this user and account type
    is loaded and used to build the ``getRandomV2`` request, which is
    converted with ``DXConvertData`` and posted to the client service.

    The success check matches the serialized result code as text.  Elsewhere
    in this code base the same check is written with spaces after the JSON
    separators, so both spellings are accepted here; the check then works
    whichever separator style ``json_dumps`` produces.

    :param args: mapping with at least ``username`` and ``account_type``
    :return: ``JsonResponse`` carrying the ok/error payload
    """
    ret_data = {}
    username = args["username"].strip()
    dx_conver = DXConvertData()
    url = "http://cservice.client.189.cn:8004/map/clientXML?encrypted=true"
    key = username + ACCOUNT_CRAWLING_SMS_HEADERS_SSDB_SUFFIX + args["account_type"]
    try:
        ssdb_conn = get_ssdb_conn()
        headers = ssdb_conn.get(key)
        if headers is not None:
            token = json_loads(headers)["token"]
            form_data = {
                "Request": {
                    "HeaderInfos": {
                        "ClientType": "#6.2.1#channel8#Huawei DUK-AL20#",
                        "Source": "110003",
                        "SourcePassword": "******",
                        "Token": token,
                        "UserLoginName": username,
                        "Code": "getRandomV2",
                        "Timestamp": strftime("%Y%m%d%H%M%S"),
                    },
                    "Content": {
                        "Attach": "test",
                        "FieldData": {
                            "PhoneNbr": username,
                            "SceneType": "7",
                            "Imsi": {}
                        }
                    }
                }
            }
            form_str = dx_conver.convert_request_data(form_data)
            sms_text = http_post(url, headers=CSERVICE_HEADERS, data=form_str,
                                 verify=False).text
            sms_dict = dx_conver.convert_response_data(sms_text)
            sms_str = json_dumps(sms_dict, ensure_ascii=False)

            # Accept both compact and spaced JSON separators.
            success_markers = ('"ResultCode":{"value":"0000"}',
                               '"ResultCode": {"value": "0000"}')
            if any(marker in sms_str for marker in success_markers):
                add_ajax_ok_json(ret_data)
            elif "服务中断" in sms_text:
                add_ajax_error_json(ret_data, "电信服务中断,请稍后再试!")
            else:
                add_ajax_error_json(ret_data, "发送失败:" + sms_str)
        else:
            add_ajax_error_json(ret_data, "无法获取短信验证码,请刷新页面重试!")
    except Exception:
        add_ajax_error_json(ret_data, "无法获取短信验证码,请重试。")
    return JsonResponse(ret_data)
def parse_this_month_bill(self, response):
    """Parse the current month's bill and request the earlier bills.

    On success ``item["history_bill"]`` is reset to a fresh ``defaultdict``
    holding this month's total, ``item["real_name"]`` is set from
    ``accNbrDetail`` and the ``queryBill`` request is yielded.  On a
    non-success result code the item is yielded and the error is reported.

    :param response: response whose ``meta`` carries ``item`` and ``login_token``
    :return: generator yielding the next request, or the item plus error output
    """
    text = response.text
    meta = response.meta
    item = meta["item"]
    username = item["username"]
    try:
        if '"resultCode":"0000"' not in text:
            yield item
            tell_msg = self.get_err_msg(text)
            yield from self.error_handle(
                username,
                "电信---获取当前月份话费账单失败:(username:%s, password:%s) %s"
                % (username, item["password"], text),
                tell_msg=tell_msg)
            return

        self.logger.info("[电信-" + username + "]: 获取当前月份话费账单成功!")
        datas = json_loads(text)["responseData"]["data"]
        item["history_bill"] = defaultdict(dict)
        current_month = strftime('%Y%m')
        item["history_bill"][current_month] = {"all_fee": datas["sumCharge"]}
        # User name
        item["real_name"] = datas["accNbrDetail"]

        field_data = {
            "accnbr": username,
            "queryflag": "",
            "queryType": "0",
            "billingcycle": get_months_str_by_number(6, False)
        }
        form_data = {
            "content": {
                "fieldData": field_data,
                "attach": "test"
            },
            "headerInfos": {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryBill",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
            }
        }
        yield Request("https://appgo.189.cn:8443/query/bill/queryBill",
                      self.parse_bill_list, "POST", self.appgo_headers,
                      json_dumps(form_data), meta=meta, dont_filter=True,
                      errback=self.err_callback)
    except Exception:
        yield item
        yield from self.except_handle(username, "电信---解析当前月份话费账单失败: %s" % text)
def parse_balance(self, response):
    """Parse the balance response and request the first personal-info page.

    On success ``item["balance"]`` is set when the response carries data, and
    the ``queryPersonalInfo`` request is yielded either way.  On a
    non-success result code the error is reported; the item is not yielded
    in this callback.

    :param response: response whose ``meta`` carries ``item``, ``login_token``
                     and ``phoneType``
    :return: generator yielding the next request or error-handling output
    """
    text = response.text
    meta = response.meta
    item = meta["item"]
    username = item["username"]
    try:
        if '"resultCode":"0000"' not in text:
            tell_msg = self.get_err_msg(text)
            yield from self.error_handle(
                username,
                "电信---获取余额失败:(username:%s, password:%s) %s"
                % (username, item["password"], text),
                tell_msg=tell_msg)
            return

        self.logger.info("[电信-" + username + "]: 获取余额成功!")
        datas = json_loads(text)["responseData"]["data"]
        if datas:
            item["balance"] = datas["totalBalance"]

        field_data = {
            "accnbr": username,
            "phoneType": meta["phoneType"],
            "starGrade": "11",
            "shopId": "20002"
        }
        form_data = {
            "content": {
                "fieldData": field_data,
                "attach": "test"
            },
            "headerInfos": {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryPersonalInfo",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
            }
        }
        yield Request("https://appgo.189.cn:8443/query/personal/queryPersonalInfo",
                      self.parse_person_info_one, "POST", self.appgo_headers,
                      json_dumps(form_data), meta=meta, dont_filter=True,
                      errback=self.err_callback)
    except Exception:
        yield from self.except_handle(username, "电信---解析余额失败: %s" % text)
def parse_bill_list(self, response):
    """Parse the bills of the last six months (current month excluded).

    Each entry of ``chargeEntities`` is merged into ``item["history_bill"]``
    keyed by its month; an empty ``sumCharge`` is stored as ``"0.0"``.  The
    ``queryCallRecharge`` request is yielded afterwards.  On a non-success
    result code the item is yielded and the error is reported.

    :param response: response whose ``meta`` carries ``item`` and ``login_token``
    :return: generator yielding the next request, or the item plus error output
    """
    text = response.text
    meta = response.meta
    item = meta["item"]
    username = item["username"]
    try:
        if '"resultCode":"0000"' not in text:
            yield item
            tell_msg = self.get_err_msg(text)
            yield from self.error_handle(
                username,
                "电信---获取最近6个月份话费账单失败:(username:%s, password:%s) %s"
                % (username, item["password"], text),
                tell_msg=tell_msg)
            return

        self.logger.info("[电信-" + username + "]: 获取最近6个月份话费账单成功!")
        datas = json_loads(text)["responseData"]["data"]["chargeEntities"]
        history_bill = item["history_bill"]
        monthly_fees = {}
        for bill in datas:
            monthly_fees[bill["month"]] = {"all_fee": bill["sumCharge"] or "0.0"}
        history_bill.update(monthly_fees)

        field_data = {
            "accnbr": username,
            "billingcycle": get_months_str_by_number(6)
        }
        form_data = {
            "content": {
                "fieldData": field_data,
                "attach": "test"
            },
            "headerInfos": {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryCallRecharge",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
            }
        }
        yield Request("https://appgo.189.cn:8443/query/payMent/queryCallRecharge",
                      self.parse_payment_list, "POST", self.appgo_headers,
                      json_dumps(form_data), meta=meta, dont_filter=True,
                      errback=self.err_callback)
    except Exception:
        yield item
        yield from self.except_handle(username, "电信---解析最近6个月份话费账单失败: %s" % text)
def is_query_condition_exists(self, condition):
    """Return whether a query condition is already recorded in MongoDB.

    The condition is normalized with ``self.dict_sorted`` and looked up by its
    JSON string.  A lookup failure still counts as "not recorded", as before,
    but it is now logged so a database problem is not mistaken for a genuinely
    new condition.

    :param condition: query condition mapping
    :return: ``True`` when a matching record is found, otherwise ``False``
    """
    try:
        serialized = json_dumps(self.dict_sorted(condition), ensure_ascii=False)
        found = self.mongo_instance.getOne(
            filter={"condition": serialized},
            fields={
                "condition": 1,
                "status": 1,
                "_id": 0
            })
        return bool(found)
    except Exception:
        self.logger.exception("query condition lookup failed: %s", condition)
        return False
def start_requests(self):
    """Reset the request state and yield the first list request.

    If building the request fails, recovery is delegated to
    ``self.exception_handle``.  That method is a generator, so it is iterated
    with ``yield from``; the previous bare call only created the generator and
    never ran it, which meant the retry request was silently lost.

    :return: generator yielding the initial (or the retry) request
    """
    # Reset the request state.
    self.reset_req()
    try:
        request = Request(
            url=self.list_url,
            method='POST',
            callback=self.parse,
            body=json_dumps(self.req_data),
            headers=self.headers,
            dont_filter=True,
            errback=self.err_callback
        )
        self.set_proxy(request)
        yield request
    except Exception:
        yield from self.exception_handle(self.condition, "start_requests error")
def hanvon_alicloudapi(pic):
    """Send an image to the Hanvon text OCR API (recognition rate: 70%).

    :param pic: image payload, sent as the ``image`` field
    :return: raw response text of the OCR service
    """
    # NOTE(review): the appcode and the query code are hard-coded here.
    appcode = 'd6147d2ef06e4ce09ce029cae877daca'
    url = '%s%s?%s' % ('http://text.aliapi.hanvon.com',
                       '/rt/ws/v1/ocr/text/recg',
                       'code=74e51a88-41ec-413e-b162-bd031fe0407e')
    payload = {'uid': "118.12.0.12",
               "lang": "chns",
               "color": "black",
               'image': pic
               }
    request_headers = {'Authorization': 'APPCODE ' + appcode,
                       'Content-Type': 'application/json; charset=UTF-8',
                       }
    return http_post(url, data=json_dumps(payload), headers=request_headers).text
def hanvon_table_alicloudapi(pic):
    """Send an image to the Hanvon table OCR API (recognition rate: 70%).

    :param pic: image payload, sent as the ``image`` field
    :return: raw response text of the OCR service
    """
    # NOTE(review): the appcode and the query code are hard-coded here.
    appcode = 'd6147d2ef06e4ce09ce029cae877daca'
    url = '%s%s?%s' % ('http://table.aliapi.hanvon.com',
                       '/rt/ws/v1/ocr/table/text/recg',
                       'code=0d3b7d23-915a-4c6f-9886-6312440aba51')
    payload = {'uid': "118.12.0.12",
               "lang": "chns",
               "color": "black",
               'image': pic
               }
    request_headers = {'Authorization': 'APPCODE ' + appcode,
                       'Content-Type': 'application/json; charset=UTF-8',
                       }
    return http_post(url, data=json_dumps(payload), headers=request_headers).text
def load_and_store_train_data(captcha_store_directory, train_data_path, loop_count=1):
    """Fetch JD login captchas, label them by hand and save the training data.

    The existing training data is loaded from ``train_data_path``.  For each
    of ``loop_count`` rounds the JD login page is fetched, its captcha image
    downloaded and reduced to a KNN distance rounded to two decimals.  When
    that distance is not yet a key of the training data, the image is saved
    and shown, the operator types the label, and a
    ``(distance, label, file name)`` entry is added.  The updated data is
    written back to ``train_data_path`` at the end.

    :param captcha_store_directory: directory where new captcha images are saved
    :param train_data_path: JSON file holding the training data
    :param loop_count: number of captchas to fetch (default ``1``)
    :return: None
    """
    with open(train_data_path, "r") as train_data_file:
        result = json_loads(train_data_file.read())

    # File names continue after the entries that already exist.
    size = len(result) + 1
    for _ in range(loop_count):
        resp = http_get("https://passport.jd.com/new/login.aspx")
        soup = BeautifulSoup(resp.text, "html.parser")
        auth_code_input = soup.select_one("#JD_Verification1")
        image_url = auth_code_input["src2"]
        resp = http_get("https:%s" % image_url)
        with Image.open(BytesIO(resp.content)) as image:
            knn_distance = get_knn_distance(image)
            knn_distance = str(float('%.2f' % knn_distance))
            if knn_distance not in result:
                image.save("%s/captcha_%d.jpg" % (captcha_store_directory, size))
                image.show()
                # Ask the operator for the captcha text.
                label = input("请输入验证码:")
                label = label.strip()
                result[knn_distance] = (knn_distance, label, "captcha_%d.jpg" % size)
                size += 1

    with open(train_data_path, "w") as train_data_file:
        train_data_file.write(json_dumps(result))


# Script entry point.  The guard must stay at module level: with the guard
# line commented out, the call below would become part of the function body
# and make the function call itself without end.  Raw strings keep the
# Windows paths free of invalid escape sequences.
if __name__ == "__main__":
    load_and_store_train_data(
        r"F:\work\公司文档\爬虫\京东验证码",
        r"F:\software\pycharm\workspace\crawler\crawler_bqjr\crawler_bqjr\spiders\b2c_ecommerce_spiders\train_data.json",
        20)
def login(self, response):
    """Issue the China Telecom app login request.

    Marks the item's brand, then posts the ``loginNormal`` form built from
    the item's user name and password to the app gateway; the answer is
    handled by ``parse_login``.

    :param response: response whose ``meta`` carries the ``item``
    :return: generator yielding the login request
    """
    meta = response.meta
    item = meta["item"]
    item["brand"] = "电信"
    username = item["username"]
    password = item["password"]

    field_data = {
        "accountType": "c2000004",
        "phoneNum": username,
        "isChinatelecom": "0",
        "systemVersion": "4.4.4",
        "authentication": password,
        "deviceUid": "860096537016542",
        "loginType": "4"
    }
    header_infos = {
        "timestamp": strftime("%Y%m%d%H%M%S"),
        "code": "loginNormal",
        "source": "110003",
        "token": "null",
        "userLoginName": username,
        "sourcePassword": "******",
        "clientType": "#6.2.1#channel29#Huawei DUK-AL20#"
    }
    form_data = {
        "content": {
            "fieldData": field_data,
            "attach": "test"
        },
        "headerInfos": header_infos
    }
    yield Request("https://appgo.189.cn:8443/login/normal", self.parse_login,
                  "POST", self.appgo_headers, json_dumps(form_data),
                  meta=meta, dont_filter=True, errback=self.err_callback)
def _login(self, response):
    """
    Prepare the login parameters and submit the login form.

    Reads the hidden form values from the login page, encrypts the password
    with the page's public key, collects the cookies set by the page and, if
    a captcha is required, asks for it before posting the login form.

    :param response: login page response; its ``meta`` carries the ``item``
    :return: generator yielding the login FormRequest or error-handling output
    """
    meta = response.meta
    item = meta["item"]
    username = item["username"]

    try:
        base_url = response.urljoin("/uc/loginService")
        uuid = self.get_value_by_name(response, "uuid")
        pubKey = self.get_value_by_name(response, "pubKey")
        seqSid = ''
        login_post_url = "%s?uuid=%s&ReturnUrl=%s&r=%s&version=2015" \
                         % (base_url, uuid, "https%3A%2F%2Fwww.jd.com%2F", str(random()))

        # The password is RSA-encrypted.
        rsa = RsaUtil(key_is_hex=False)
        encode_pwd = rsa.encrypt(item["password"], pubkey=pubKey)

        # Flatten every "k=v" fragment of the Set-Cookie headers into one dict
        # (cookie attributes written as "k=v" are included as well).
        temp_cookies = {}
        for c in response.headers.getlist('Set-Cookie', []):
            temp_cookies.update(
                dict(kv.strip().split("=", 1) for kv in c.decode().split(";")
                     if "=" in kv))
        meta["cookies"] = temp_cookies

        authcode = ""
        capthca_div = response.xpath(
            '//div[@id="o-authcode"][@style="display: block;"]'
        ).extract_first()
        if capthca_div:
            # The page already shows the captcha block.
            need_captcha = True
        else:
            # Otherwise ask the showAuthCode endpoint for this login name.
            auth_url = 'https://passport.jd.com/uc/showAuthCode?r=%s&version=2015' % str(
                random())
            auth_page = self.http_request(auth_url,
                                          method="POST",
                                          cookies=temp_cookies,
                                          headers=self.headers,
                                          data={"loginName": username})
            need_captcha = ('verifycode":true' in auth_page)

        code_url = response.xpath(
            '//img[@id="JD_Verification1"]/@src2').extract_first()
        if need_captcha and code_url:
            self.logger.info("需要输入验证码")
            new_headers = self.authcode_headers
            # Prefix "https:" unless the URL already starts with "http".
            code_url = "https:" + code_url if not code_url.startswith(
                "http") else code_url
            code_body = self.http_request(code_url,
                                          headers=new_headers,
                                          get_str=False)
            # Store the request headers etc. in ssdb so the image captcha can
            # be refreshed.
            ssdb_headers_data = json_dumps({
                "headers": new_headers,
                "uuid": uuid
            })
            self.set_image_captcha_headers_to_ssdb(
                headers=ssdb_headers_data, username=username)
            authcode = self.ask_image_captcha(code_body, username)
            self.logger.info("验证码:%s" % authcode)

        login_post_data = {
            "uuid": uuid,
            "eid": self.get_value_by_name(response, "eid"),
            "fp": self.get_value_by_name(response, "fp"),
            "_t": self.get_value_by_name(response, "_t"),
            "loginType": self.get_value_by_name(response, "loginType"),
            "loginname": username,
            "nloginpwd": encode_pwd,
            "chkRememberMe": "",
            "authcode": authcode,
            "pubKey": pubKey,
            "sa_token": self.get_value_by_name(response, "sa_token"),
            "seqSid": seqSid or ''
        }
        self.logger.debug(login_post_data)
        yield FormRequest(url=login_post_url,
                          headers=self.headers,
                          cookies=meta["cookies"],
                          callback=self._parse_login_status,
                          meta=meta,
                          formdata=login_post_data,
                          dont_filter=True,
                          errback=self.err_callback)
    except BadCaptchaFormat:
        yield from self.error_handle(username, "获取验证码图片失败")
    except Exception:
        yield from self.except_handle(username, "准备登录参数异常")
def store_train_data(data, filepath):
    """Write ``data`` to ``filepath`` as JSON, replacing the file's content.

    The data is serialized before the file is opened.  Opening in ``"w"``
    mode truncates the file immediately, so serializing first keeps the
    existing training data intact when ``data`` cannot be serialized.

    :param data: JSON-serializable training data
    :param filepath: destination file path
    :return: None
    """
    serialized = json_dumps(data)
    with open(filepath, "w") as train_data_file:
        train_data_file.write(serialized)
def parse_person_info_two(self, response):
    """Parse the second personal-info response and start fetching call records.

    On success the real-name flag, sex, ID number and contact address are
    stored on the item.  The login token is then published to SSDB, an SMS
    code is requested from the user, and the call-detail request for the
    current month is yielded.  On a non-success result code, or when waiting
    for the SMS code times out, the item is yielded and the error is
    reported.

    :param response: response whose ``meta`` carries ``item`` and ``login_token``
    :return: generator yielding the next request, or the item plus error output
    """
    meta = response.meta
    item = meta["item"]
    login_token = meta["login_token"]
    username = item["username"]
    text = ""
    try:
        decoded = self.dx_conver.convert_response_data(response.text)
        text = json_dumps(decoded, ensure_ascii=False)

        if '"ResultCode":{"value":"0000"}' not in text:
            yield item
            tell_msg = self.get_err_msg(text, True)
            yield from self.error_handle(
                username,
                "电信---获取个人信息2失败:(username:%s, password:%s) %s"
                % (username, item["password"], text),
                tell_msg=tell_msg)
            return

        self.logger.info("[电信-" + username + "]: 获取个人信息2成功!")
        datas = decoded["Response"]["ResponseData"]["Data"]
        basic_info = datas["BasicInfo"]
        item["is_real_name"] = (datas["Authenticate"]["value"] == "true")
        if basic_info["Sex"]["value"] == "0":
            item["sex"] = Sex.Male
        else:
            item["sex"] = Sex.Female
        item["identification_number"] = basic_info["IdCardNo"]["value"]
        item["contact_addr"] = basic_info["Address"]["value"]

        # Personal info 3 (the "custInfo" request handled by
        # parse_person_info_three) is not used for now.
        #
        # Querying call records officially needs SMS code, ID number and
        # name.  Because of a bug in the Telecom app the ID/name check can be
        # bypassed, so only the SMS code is requested here.
        self._set_sms_captcha_headers_to_ssdb(username, login_token)
        uid = self.need_sms_captcha_type(username)
        sms_code = self.ask_captcha_code(uid)

        item["history_call"] = {}
        meta["call_count"] = 0
        yield self.get_call_detail(response, sms_code, date.today())
    except CaptchaTimeout:
        yield item
        yield from self.error_handle(username, "电信---解析验证码失败,等待验证码超时",
                                     tell_msg="等待验证码超时,请刷新页面重试。")
    except Exception:
        yield item
        yield from self.except_handle(username, "电信---解析个人信息2失败: %s" % text)
def parse_login(self, response):
    """Parse the login response and request the account balance.

    On success the city (only when the item's city is empty), the login token
    and the phone type are recorded and the ``queryExpense`` request is
    yielded.  Known failure codes map to fixed user-facing messages, an empty
    body is reported as a server problem, and anything else gets its message
    from ``get_err_msg``.

    :param response: response whose ``meta`` carries the ``item``
    :return: generator yielding the next request or error-handling output
    """
    text = response.text
    meta = response.meta
    item = meta["item"]
    username = item["username"]

    fail_template = "电信---登录失败:(username:%s, password:%s) %s"
    # Checked in this order; the first marker found in the body wins.
    known_failures = (
        ('"resultCode":"3001"', "账号或密码错误!"),
        ('"resultCode":"3002"', "该手机号还未进行注册!"),
        ('"resultCode":"8105"', "弱密码,请重置服务密码后登录!"),
        ('"code":"X102"', "设备机型不能为空!"),
    )
    try:
        if '"resultCode":"0000"' in text:
            self.logger.info("[电信-" + username + "]: 登录成功!")
            datas = json_loads(text)["responseData"]["data"]["loginSuccessResult"]
            if item["city"] == "":
                item["city"] = datas["cityName"]
            meta.setdefault("login_token", datas["token"])
            meta.setdefault("phoneType", datas["phoneType"])

            field_data = {
                "queryflag": "",
                "payflag": "0",
                "destinationid": username,
                "shopId": "20002"
            }
            form_data = {
                "content": {
                    "fieldData": field_data,
                    "attach": "test"
                },
                "headerInfos": {
                    "timestamp": strftime("%Y%m%d%H%M%S"),
                    "code": "queryExpense",
                    "source": "110003",
                    "token": meta["login_token"],
                    "userLoginName": username,
                    "sourcePassword": "******",
                    "clientType": "#6.2.1#channel8#samsung SM-N935F#"
                }
            }
            yield Request("https://appgo.189.cn:8443/query/balance/queryExpense",
                          self.parse_balance, "POST", self.appgo_headers,
                          json_dumps(form_data), meta=meta, dont_filter=True,
                          errback=self.err_callback)
            return

        for marker, known_msg in known_failures:
            if marker in text:
                yield from self.error_handle(
                    username,
                    fail_template % (username, item["password"], text),
                    tell_msg=known_msg)
                return

        if text == "":
            yield from self.error_handle(
                username,
                fail_template % (username, item["password"], "电信服务器异常"),
                tell_msg="电信服务器异常,请稍后再试!")
        else:
            tell_msg = self.get_err_msg(text)
            yield from self.error_handle(
                username,
                fail_template % (username, item["password"], text),
                tell_msg=tell_msg)
    except Exception:
        yield from self.except_handle(username, "电信---解析登录失败: %s" % text)
def telecom_bills_validation(request):
    """Validate a China Telecom user with SMS code, name and ID card number.

    The login token previously stored in SSDB for this user and account type
    is loaded and used to build the ``randomCodeAndAuthValidate`` request,
    which is converted with ``DXConvertData`` and posted to the client
    service.  The decoded response is serialized and its result code matched
    as text.

    The result-code check used to require a space after the JSON separators,
    while the SMS-sending view matches the compact spelling.  Both spellings
    are accepted now, so the check works whichever separator style
    ``json_dumps`` produces.

    :param request: request whose ``POST`` carries ``username``,
                    ``sms_captcha``, ``name``, ``id_card`` and ``account_type``
    :return: ``JsonResponse`` carrying the ok/error payload
    """
    ret_data = {}
    args = request.POST
    username = args["username"].strip()
    sms_captcha = args["sms_captcha"]
    name = args["name"]
    id_card = args["id_card"]
    dx_conver = DXConvertData()
    url = "http://cservice.client.189.cn:8004/map/clientXML?encrypted=true"
    key = username + ACCOUNT_CRAWLING_SMS_HEADERS_SSDB_SUFFIX + args["account_type"]
    try:
        ssdb_conn = get_ssdb_conn()
        headers = ssdb_conn.get(key)
        if headers is not None:
            token = json_loads(headers)["token"]
            form_data = {
                "Request": {
                    "HeaderInfos": {
                        "ClientType": "#6.2.1#channel8#Huawei DUK-AL20#",
                        "Source": "110003",
                        "SourcePassword": "******",
                        "Token": token,
                        "UserLoginName": username,
                        "Code": "randomCodeAndAuthValidate",
                        "Timestamp": strftime("%Y%m%d%H%M%S"),
                    },
                    "Content": {
                        "Attach": "test",
                        "FieldData": {
                            "ShopId": "20002",
                            "IdCardNum": id_card,
                            "RandomCode": sms_captcha,
                            "PhoneNum": username,
                            "Username": name,
                            "ValidateType": "1"
                        }
                    }
                }
            }
            form_str = dx_conver.convert_request_data(form_data)
            res_content = http_post(url, headers=CSERVICE_HEADERS, data=form_str,
                                    verify=False)
            # Decrypt the returned data.
            res_dict = dx_conver.convert_response_data(res_content.text)
            res_str = json_dumps(res_dict, ensure_ascii=False)

            def has_result_code(code):
                """Match the result code with compact or spaced separators."""
                return ('"ResultCode":{"value":"%s"}' % code in res_str
                        or '"ResultCode": {"value": "%s"}' % code in res_str)

            if has_result_code("0000"):
                add_ajax_ok_json(ret_data)
            elif has_result_code("0001"):
                add_ajax_error_json(ret_data, "非实名制用户")
            elif has_result_code("9152"):
                add_ajax_error_json(ret_data, "验证码错误!")
            else:
                add_ajax_error_json(ret_data, "验证失败,请重试:" + res_str)
        else:
            add_ajax_error_json(ret_data, "验证失败,没有用户信息,请刷新重试。")
    except Exception:
        add_ajax_error_json(ret_data, "验证失败,请重试。")
    return JsonResponse(ret_data)
def parse(self, response):
    """Parse one page of the document list and schedule the follow-up requests.

    The body is base64-decoded, decrypted and parsed as a JSON list.  For each
    document that has an id and is not stored yet, a detail request handled by
    ``parse_doc`` is yielded.  The paging offset is then advanced, the
    condition is recorded as crawled, and the next list request is yielded.

    If anything fails, recovery is delegated to ``self.exception_handle``.
    That method is a generator, so it is iterated with ``yield from``; the
    previous bare call only created the generator and never ran it, which
    meant no retry took place.

    :param response: list response (encrypted, base64-encoded body)
    :return: generator yielding detail requests and the next list request
    """
    self.exception_response(self.condition, response)
    self.logger.info("list_req_data->%s" % self.req_data)
    self.logger.info("skip->%s" % self.start_index)
    self.logger.info("parse_response->%s" % response.text)
    try:
        # Base64-decode the received data, AES-decrypt it and turn it into a string.
        text = decrypt(b64decode(response.text)).decode("utf-8")
        # Drop everything after the last "]".
        text = text[:text.rfind(']') + 1]
        docs = json_loads(text)
        self.logger.info("list->%s:%s" % (str(len(docs)), text))

        for doc in docs:
            item = WenshuItem()
            item["case_type"] = doc.get("案件类型", "")
            item["sentence_date"] = doc.get("裁判日期", "")
            item["case_name"] = doc.get("案件名称", "")
            item["file_id"] = doc.get("文书ID", "")
            item["trial_procedure"] = doc.get("审判程序", "")
            item["case_no"] = doc.get("案号", "")
            item["court_name"] = doc.get("法院名称", "")
            item["relation"] = doc.get("关联文书", "")

            # Skip documents without an id or already stored.
            if not item["file_id"] or self.is_wenshu_id_exists(item["file_id"]):
                self.logger.info("%s has saved!continue!" % item["file_id"])
                continue

            req_data = {
                "fileId": item["file_id"],  # document id
                "reqtoken": get_reqtoken()  # request token
            }
            self.logger.info("doc_req_data->%s" % req_data)
            request = Request(
                url=self.doc_url,
                method='POST',
                callback=self.parse_doc,
                body=json_dumps(req_data),
                meta={"item": item},
                headers=self.headers,
                dont_filter=True,
                errback=self.err_callback
            )
            self.set_proxy(request)
            yield request

        # Advance the offset by one page so the next request gets the next page.
        self.start_index += self.page_size
        # Record the condition as crawled (status 1).
        self.record_query_condition(self.dict_sorted(self.condition), 1)

        # The first page should contain data; if not, the proxy is suspected.
        if not docs and self.start_index <= 20:
            self.logger.debug("查询结果至少第一页应该有数据,否则就可能是代理的问题")
            self.push_wenshu_condition(self.condition)
            self.proxy = self.proxy_api.get_proxy_one()  # switch proxy
            self.reset_req()

        # With no data, or past offset 220, reset the request state so the
        # next query condition is used.
        if not docs or self.start_index > 220:
            self.reset_req()

        self.req_data["skip"] = str(self.start_index)
        request = Request(
            url=self.list_url,
            method='POST',
            callback=self.parse,
            body=json_dumps(self.req_data),
            headers=self.headers,
            dont_filter=True,
            errback=self.err_callback
        )
        self.set_proxy(request)
        yield request
    except Exception:
        yield from self.exception_handle(self.condition, "parse error")
def generate_query(self):
    """
    Generate query conditions and push them to the query queue.

    For every court read from MongoDB, the range ``self.start_date`` ..
    ``self.end_date`` is split into date windows.  A window whose result
    count is between 0 and 220 is recorded (and queued when it is not empty);
    a larger window makes the loop pick a shorter interval and try again.
    One-day windows that are still too large are split by case type.

    :return: None
    """
    self.logger.info("query condition init begin!")

    # Courts to query (part of the query condition).
    mongo_instance = MongoDB(MONGO_WENSHU_DB, MONGO_CHINACOURT_COLLECTIONS)
    # Make sure the cursor does not time out.
    cursor = mongo_instance.getAll(fields={
        "_id": 1,
        "name": 1
    },
                                   sort=[("province", MONGO_ASCENDING)],
                                   no_cursor_timeout=True)
    court_list = [court["name"] for court in cursor]
    # Case types.
    case_type_list = ["1", "2", "3", "4", "5"]
    for court in court_list:
        count = 1
        avg_interval = 0
        # Original note: when the count is very large, total/220 is used
        # directly instead of the interval in days.
        avg_interval_first = 0
        start_date = datetime.strptime(self.start_date, "%Y-%m-%d")
        end_date = datetime.strptime(self.end_date, "%Y-%m-%d")
        while True:
            divisor = (count**2) if count != 2 else 2
            # Average interval in days: the estimate from the previous round
            # if there is one, otherwise the remaining range over the divisor.
            interval_day = avg_interval if avg_interval > 0 else ceil(
                (end_date - start_date).days / divisor)
            if avg_interval_first > 0:
                avg_interval = avg_interval_first
                avg_interval_first = 0
            self.logger.info("interval_day->%s" % interval_day)
            # Build the query date window, capped at end_date.
            end_date_temp = min(start_date + timedelta(days=interval_day),
                                end_date)
            query_date = "%s TO %s" % (start_date.strftime("%Y-%m-%d"),
                                       end_date_temp.strftime("%Y-%m-%d"))
            self.logger.info("query_date->%s!" % query_date)
            query_condition = dict()
            query_condition["case_type"] = "0"  # all case types
            query_condition["court"] = court
            query_condition["date"] = query_date

            if self.is_query_condition_exists(query_condition):
                if end_date == end_date_temp:
                    self.logger.info("%s query_condition exists!break!" %
                                     court)
                    break
                else:
                    start_date = end_date_temp + self.one_day
                    self.logger.info(
                        "%s query_condition exists!continue!" %
                        json_dumps(query_condition))
                    continue

            # A window with at most 220 results is accepted as it is.
            query_count = self.get_count_by_condition(court=court,
                                                      date=query_date)
            if 0 <= query_count <= 220:
                if query_count > 0:
                    self.record_query_condition(query_condition)
                    self.push_query_condition_queue(query_condition)
                # An empty result is only saved to mongo, with status -1.
                if query_count == 0:
                    self.record_query_condition(query_condition, -1)
                if end_date == end_date_temp:
                    if count > 1:
                        # The full date range of this court is stored in
                        # mongodb as well, with status -1.
                        init_date = "%s TO %s" % (self.start_date,
                                                  self.end_date)
                        query_condition["date"] = init_date
                        self.record_query_condition(query_condition, -1)
                    self.logger.info("%s query condition end!" % court)
                    break
                else:
                    start_date = end_date_temp + self.one_day
            else:
                if count > 1:
                    avg_interval_first = avg_interval
                temp_days = (end_date_temp - start_date).days
                try:
                    # Estimate an interval that should give about 180 results.
                    avg_interval = int(180 / (int(query_count) / temp_days))
                except ZeroDivisionError:
                    self.logger.exception("爬取出错,出错原因:")
                    break
                # If a one-day window still has more than 220 results, the
                # condition is stored once per case type.
                if temp_days == 1:
                    for case_type in case_type_list:
                        query_condition["case_type"] = case_type
                        if not self.is_query_condition_exists(
                                query_condition):
                            self.record_query_condition(query_condition)
                            self.push_query_condition_queue(
                                query_condition)
                    if end_date == end_date_temp:
                        if count > 1:
                            # The full date range of this court is stored in
                            # mongodb as well, with status -1.
                            init_date = "%s TO %s" % (self.start_date,
                                                      self.end_date)
                            query_condition["date"] = init_date
                            self.record_query_condition(
                                query_condition, -1)
                        self.logger.info("%s query condition end!" % court)
                        break
                    else:
                        start_date = end_date_temp + self.one_day
            count += 1
    self.logger.info("query condition init end!")
def parse_call_list(self, response):
    """Parse one month of call records and schedule the next step.

    Outcomes by result code:

    * ``0000``: the month's calls are stored under ``item["history_call"]``
      (a single record and a list of records are handled alike).
    * ``9152``: the SMS code expired; a new one is requested and the same
      month is fetched again.
    * ``8204``: the account does not support call queries; the item is
      yielded and the error reported.
    * anything else: retried through ``retry_call_list`` while the per-month
      retry counter is below ``self.retry_call_times``; after that it is only
      logged.

    Unless the method returned early, the previous month is requested while
    ``meta["call_count"]`` is below ``self.CALL_COUNT_LIMIT``; otherwise the
    current month's bill is requested.

    :param response: response whose ``meta`` carries ``item``, ``sms_code``,
                     ``last_call_month``, ``call_count`` and ``login_token``
    :return: generator yielding requests, or the item plus error output
    """
    meta = response.meta
    item = meta["item"]
    sms_code = meta["sms_code"]
    username = item["username"]
    res_text = ""
    try:
        this_month = meta["last_call_month"]
        this_month_str = this_month.strftime("%Y%m")
        res_dict = self.dx_conver.convert_response_data(response.text)
        res_text = json_dumps(res_dict, ensure_ascii=False)

        if '"ResultCode":{"value":"0000"}' in res_text:
            self.crawling_login(username)  # notify that authorization succeeded
            self.logger.info("[电信-" + username + "]: 获取-" + this_month_str + "-通话记录成功!")
            item["history_call"][this_month_str] = []
            datas = res_dict["Response"]["ResponseData"]["Data"]
            if datas:
                raw_calls = datas["Items"]["Item"]
                # A single record arrives as a mapping rather than a list.
                if not isinstance(raw_calls, list):
                    raw_calls = [raw_calls]
                item["history_call"][this_month_str] = [{
                    "time": call["CallTime"]["value"],
                    "duration": call["CallTimeCost"]["value"],
                    "type": "1" if call["CallType"]["value"] == "0" else "0",
                    "other_num": call["CallMobile"]["value"],
                    "my_location": call["CallArea"]["value"],
                    "fee": call["CallFee"]["value"]
                } for call in raw_calls]
        elif '"ResultCode":{"value":"9152"}' in res_text:
            # The SMS code timed out: a new code has to be sent.
            self.logger.info("[电信]短信验证码超时,请求重新发送短信验证码!")
            self._set_sms_captcha_headers_to_ssdb(username, meta["login_token"])
            sms_uid = self.need_sms_captcha_type(username)
            sms_code = self.ask_captcha_code(sms_uid)
            meta["call_count"] -= 1
            yield self.get_call_detail(response, sms_code, this_month)
            return
        elif '"ResultCode":{"value":"8204"}' in res_text:
            yield item
            yield from self.error_handle(
                username,
                "电信---获取通话记录失败:(username:%s, password:%s) %s"
                % (username, item["password"], res_text),
                tell_msg="此账号不支持通话记录查询,请联系电信运营商。")
            return
        elif meta.get("call_" + this_month_str + "_retry", 0) < self.retry_call_times:
            if '"ResultCode":{"value":"1"}' in res_text:
                self.logger.error("电信---获取通话记录失败:服务器异常!")
            elif response.text == "":
                self.logger.error("电信---获取通话记录失败:服务器返回值为空!")
            else:
                self.logger.error("电信---获取通话记录失败:{}".format(res_text))
            yield self.retry_call_list(response, res_text)
            return
        else:
            self.logger.error("电信---获取通话记录失败:(username:%s, password:%s) %s"
                              % (username, item["password"], res_text))

        # Continue with the previous month's call records.
        if meta["call_count"] < self.CALL_COUNT_LIMIT:
            last_month = get_last_month_from_date(this_month)
            yield self.get_call_detail(response, sms_code, last_month)
        else:
            # Request the bill.
            field_data = {
                "accnbr": username,
                "queryflag": "0",
                "queryType": "0",
            }
            form_data = {
                "content": {
                    "fieldData": field_data,
                    "attach": "test"
                },
                "headerInfos": {
                    "timestamp": strftime("%Y%m%d%H%M%S"),
                    "code": "queryThisMonthBill",
                    "source": "110003",
                    "token": meta["login_token"],
                    "userLoginName": username,
                    "sourcePassword": "******",
                    "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
                }
            }
            yield Request("https://appgo.189.cn:8443/query/bill/queryThisMonthBill",
                          self.parse_this_month_bill, "POST", self.appgo_headers,
                          json_dumps(form_data), meta=meta, dont_filter=True,
                          errback=self.err_callback)
    except CaptchaTimeout:
        yield item
        yield from self.error_handle(username, "电信---解析验证码失败,等待验证码超时",
                                     tell_msg="等待验证码超时,请刷新页面重试。")
    except Exception:
        yield item
        yield from self.except_handle(username, "电信---解析通话记录失败: %s" % res_text)