def parse_queryHistoryBill(self, response):
    """
    Parse the bill of a single month and schedule the follow-up request.

    Reads ``item`` and ``last_bill_month`` from ``response.meta``.  A body
    containing ``"result":`` stores ``{"all_fee": <float or None>}`` under
    the ``YYYYMM`` key of ``item["history_bill"]``.  An unrecognised body is
    re-sent once (tracked by a ``bill_<YYYYMM>_retry`` meta flag) and only
    logged the second time.  Afterwards either another month's bill is
    requested or, once ``meta["bill_count"]`` reaches
    ``self.BILL_COUNT_LIMIT``, payment-record crawling is started.
    """
    body = response.text
    meta = response.meta
    item = meta["item"]
    try:
        bill_month = meta["last_bill_month"]
        month_key = bill_month.strftime("%Y%m")
        retry_flag = "bill_" + month_key + "_retry"

        if '"result":' in body:
            raw_fee = json_loads(body)["result"].get("allfee")
            # A missing fee is stored as None, not as 0.
            item["history_bill"][month_key] = {
                "all_fee": None if raw_fee is None else float(raw_fee),
            }
        elif '"historyResultList"' in body and "企业套餐" in body:
            # Enterprise plans are not supported: stop crawling this account.
            unsupported = "联通---暂不支持企业套餐用户验证。"
            yield item
            yield from self.error_handle(item["username"], unsupported,
                                         unsupported)
            return
        elif retry_flag not in meta:
            # First failure for this month: re-send the same request once.
            # NOTE(review): the account password is written to the log in
            # plain text here and below -- consider redacting it.
            self.logger.error("联通---重试账单:(username:%s, password:%s) %s"
                              % (item["username"], item["password"], body))
            sleep(0.6)
            retry_request = response.request.copy()
            retry_request.meta[retry_flag] = 1
            yield retry_request
            return
        else:
            self.logger.error("联通---获取账单失败:(username:%s, password:%s) %s"
                              % (item["username"], item["password"], body))

        if meta["bill_count"] < self.BILL_COUNT_LIMIT:
            # More months are still wanted: fetch another month's bill.
            yield self._get_historyBill_request(response, bill_month)
            return

        # Enough bills collected: move on to the payment records.
        meta["payment_count"] = 0
        item["history_payment"] = defaultdict(dict)
        yield self._get_payment_request(response, date.today())
    except Exception:
        yield item
        yield from self.except_handle(item["username"],
                                      "联通---解析账单失败: %s" % body)
def send_sms_code(request):
    """
    Send (or re-send) the SMS verification code used for login.

    :param request: request whose POST data carries ``is_first`` and, for
        the first call, ``username`` and ``account_type``
    :return: JsonResponse built from the ok / error result dictionary
    """
    ret_data = {}
    try:
        post = request.POST
        session = request.session
        if post.get("is_first", False) == "true":
            ssdb_key = (post["username"].strip()
                        + ACCOUNT_CRAWLING_SMS_HEADERS_SSDB_SUFFIX
                        + post["account_type"])
            cached_headers = get_ssdb_conn().get(ssdb_key)
            if not cached_headers:
                add_ajax_error_json(ret_data, "获取短信验证码失败")
                return JsonResponse(ret_data)

            session["send_url"] = json_loads(cached_headers).get("url", "")
            session["last_send_time"] = time()
            # The first code is sent automatically, so treat it as a
            # successful send without calling the remote service.
            res_json = {"stat": "ok", "info": {"sent": True}}
        else:
            # Wait until SMS_SLEEP_TIME + 2 seconds have passed since the
            # recorded send time before asking for another code.
            # NOTE(review): "last_send_time" is not refreshed after a
            # re-send -- confirm whether that is intended.
            last_sent = session.get("last_send_time", 0)
            wait_seconds = 0
            if last_sent:
                wait_seconds = max(last_sent + SMS_SLEEP_TIME + 2 - time(), 0)
            sleep(wait_seconds)
            res_json = get_response_by_requests(
                session.get("send_url"), headers=DEFAULT_HEADERS).json()

        info = res_json.get("info", {})
        if res_json.get("stat") == "ok" and info.get("sent"):
            add_ajax_ok_json(ret_data)
        else:
            add_ajax_error_json(ret_data,
                                info.get("errorMessage") or "发送短信验证码失败")
    except Exception:
        add_ajax_error_json(ret_data, "发送短信验证码出错")
    return JsonResponse(ret_data)
def parse_checklogin(self, response):
    """
    Parse the identity (ID-card) information of the logged-in user.

    When the body contains ``"userInfo":`` the registration date, network
    age, ID number/address, real name, sex, package and status are copied
    into the item; otherwise the failure is only logged.  In both cases
    call-record crawling is started afterwards.
    """
    body = response.text
    meta = response.meta
    item = meta["item"]
    try:
        if '"userInfo":' in body:
            user = json_loads(body)["userInfo"]
            opened = user["opendate"]
            # The first eight characters are sliced as YYYY, MM, DD.
            registered_on = "-".join((opened[:4], opened[4:6], opened[6:8]))
            item["registration_time"] = registered_on
            item["in_nets_duration"] = get_in_nets_duration_by_start_date(
                registered_on)
            item["identification_number"] = user["certnum"]
            item["identification_addr"] = user["certaddr"]
            item["real_name"] = user["custName"]
            if user["custsex"] == "1":
                item["sex"] = Sex.Male
            else:
                item["sex"] = Sex.Female
            item["package"] = user["brand_name"] + "-" + user["packageName"]
            if user["status"] == "开通":
                item["status"] = UserStatus.Opened
            else:
                item["status"] = UserStatus.Shutdown
        else:
            # NOTE(review): the account password is logged in plain text.
            self.logger.error(
                "联通---获取身份证信息失败:(username:%s, password:%s) %s"
                % (item["username"], item["password"], body))

        # Next stage: call records, starting from the current month.
        sleep(0.6)
        meta["call_count"] = 0
        item["history_call"] = defaultdict(dict)
        yield self._get_callDetail_request(response, date.today())
    except Exception:
        yield item
        yield from self.except_handle(item["username"],
                                      "联通---解析身份证信息失败: %s" % body)
def parse_item(self, response):
    """
    Parse the detail response of one executed-person record.

    A 302 status, an empty ``{}`` body or a "please enable JavaScript" page
    triggers a new detail request (with a refreshed captcha when the used
    one is still the current one).  An HTML page yields the item untouched.
    Any other body is decoded as JSON and copied into the item.
    """
    body = response.text
    item = response.meta["item"]
    try:
        must_retry = True
        if response.status == 302:
            # Redirect: the site reports too-frequent access.
            self.logger.error("被执行人---详情:访问频繁")
            sleep(self.sleep_time)
        elif body == "{}":
            # Empty object: the captcha was rejected.
            self.logger.warning("被执行人---详情验证码错误。")
        elif "请开启J" in body:
            self.logger.error("被执行人---详情:请开启JavaScript并刷新该页")
            sleep(self.sleep_time)
        elif body.startswith("<!DOCTYPE"):
            yield item
            return
        else:
            must_retry = False

        if not must_retry:
            detail = json_loads(body)
            item["id"] = detail.get("partyCardNum", "")
            item["execution_court"] = detail.get("execCourtName")
            item["execution_money"] = detail.get("execMoney")
            yield item
            return

        query = parse_qs(urlsplit(response.url).query)
        used_captcha = query["j_captcha"][0]
        if used_captcha == self.captcha_code:
            # Only fetch a new captcha if nobody refreshed it already.
            self.captcha_code = self.get_captcha_code(response)
        new_query = {
            "id": query["id"][0],
            "j_captcha": self.captcha_code,
            "captchaId": self.captcha_id,
        }
        yield Request("http://zhixing.court.gov.cn/search/newdetail?"
                      + urlencode(new_query),
                      self.parse_item, dont_filter=True,
                      meta=response.meta, errback=self.err_callback)
    except Exception:
        self.logger.exception("text(%s) url(%s)" % (body, response.url))
def parse_vcode_url(self, response):
    """
    Read the captcha URL from the JSON body and request the captcha image.

    If anything goes wrong the error is logged and, when the spider headers
    contain a ``Referer``, that page is requested again through
    ``self.parse``.
    """
    try:
        captcha_path = json_loads(response.body)['url']
        yield Request(url=self.start_url + captcha_path,
                      headers=self.headers,
                      callback=self.parse_vcode,
                      dont_filter=True,
                      errback=self.err_callback)
    except Exception:
        self.logger.error("贷款盟---得到验证码请求访问失败!")
        referer = self.headers.get('Referer', '')
        if not referer:
            return
        yield Request(url=referer,
                      headers=self.headers,
                      callback=self.parse,
                      dont_filter=True,
                      errback=self.err_callback)
def _get_phone_info_from_aliyun_api1(phone_num):
    """
    Look up a phone number through the Aliyun "showphone" market API (api1;
    the quota can be re-purchased without limit, 1000 calls for 0 yuan).

    :param phone_num: phone number inserted into the query string
    :return: tuple ``(brand, province, city)``; the brand is reduced to
        "移动" / "联通" / "电信" when it starts with one of them, and trailing
        "市" characters are stripped from the city
    :raises Exception: when the API answers with a non-zero ``ret_code``
    """
    url = "http://showphone.market.alicloudapi.com/6-1?num=%s" % phone_num
    req = Request(url)
    # NOTE(review): hard-coded API credential -- consider moving it to
    # configuration.
    req.add_header('Authorization', 'APPCODE 4dda59bc51eb4fd78bcdb3c54e5c3405')
    # The context manager closes the HTTP response (and its socket) even if
    # reading or decoding fails.
    with urlopen(req, timeout=ASK_TIMEOUT) as resp:
        data = json_loads(resp.read())

    result = data["showapi_res_body"]
    if result["ret_code"] != 0:
        raise Exception("aliyun api1 lookup failed for %s: ret_code=%r"
                        % (phone_num, result["ret_code"]))

    brand = space_pattern.sub("", result["name"].replace("虚拟运营商", ""))
    for carrier in ("移动", "联通", "电信"):
        if brand.startswith(carrier):
            brand = carrier
            break
    return brand, result["prov"], result["city"].rstrip("市")
def parse_extraction(self, response):
    """
    Parse the extraction detail list and finish the crawl.

    Stores the ``datalist`` value of the JSON body (an empty list when the
    key is absent) in ``item["fetch_detail"]`` and then delegates to
    ``self.crawling_done``; any exception is routed to
    ``self.except_handle``.

    :param response: response whose ``meta["item"]`` is the item being built
    :return: generator of whatever the done / exception handlers yield
    """
    item = response.meta["item"]
    try:
        payload = json_loads(response.text)
        item["fetch_detail"] = payload.get("datalist", [])
        # This is the last stage of the crawl.
        yield from self.crawling_done(item)
    except Exception:
        yield from self.except_handle(item["username"],
                                      "广州公积金中心--提取进度解析异常")
def qq_get_qrcode_status(request):
    """
    Check the state of a QQ login QR code.

    :param request: request whose POST data carries ``cookies`` (a JSON
        object containing ``qrsig``) and optionally ``appid`` / ``daid``
    :return: JsonResponse; on success ``data`` holds ``qr_code``,
        ``qr_url``, ``qr_status`` and ``qr_nick_name``; on any failure the
        error message "二维码失效" is reported instead
    """
    ret_data = {}
    try:
        args = request.POST
        # A missing "cookies" field makes json_loads raise, which is
        # reported below like every other failure.
        cookies = json_loads(args.get('cookies', {}))
        appid = args.get('appid', '522005705')
        daid = args.get('daid', '4')
        qrsig = cookies.get('qrsig')
        ptqrtoken = __hash_33(qrsig)
        scan_url = (
            "https://ssl.ptlogin2.qq.com/ptqrlogin?u1=https%3A%2F%2Fmail.qq.com%2Fcgi-bin%2Freadtemplate%3"
            "Fcheck%3Dfalse%26t%3Dloginpage_new_jump%26vt%3Dpassport%26vm%3Dwpt%26ft%3Dlogi"
            "npage%26target%3D&ptqrtoken={0}&ptredirect=0&h=1&t=1&g=1&from_ui=1&pt"
            "lang=2052&action=1-1-1513651703600&js_ver=10232&js_type=1&login_s"
            "ig=&pt_uistyle=25&aid={1}&daid={2}&"
        ).format(ptqrtoken, appid, daid)

        headers = QQ_GET_QRCODE_STATUS_HEADERS.copy()
        headers['Cookie'] = "qrsig=" + qrsig
        scan_text = http_get(scan_url, headers=headers, cookies=cookies).text

        # The answer is a comma-separated argument list; fields 0, 2, 4 and
        # 5 are read as code, url, status text and nick name.
        fields = qq_qr_result_list_pattern.search(scan_text).group(1).split(',')
        ret_data["data"] = {
            'qr_code': qq_qr_result_info_pattern.search(fields[0]).group(1),
            'qr_url': qq_qr_result_info_pattern.search(fields[2]).group(1),
            'qr_status': qq_qr_result_info_pattern.search(fields[4]).group(1),
            'qr_nick_name': qq_qr_result_info_pattern.search(fields[5]).group(1),
        }
        add_ajax_ok_json(ret_data)
    except Exception:
        add_ajax_error_json(ret_data, "二维码失效")
    # Returned after the try statement rather than from a ``finally``
    # clause: a ``return`` inside ``finally`` would also discard
    # KeyboardInterrupt / SystemExit and any error raised by the handler.
    return JsonResponse(ret_data)
def parse_spdb_credit_email_html(html_string, subject=""):
    """
    Extract SPDB credit-card bill information from a bill e-mail.

    The text of the first ``<td>`` is searched for name, repayment amount
    and due date.  Then, on a best-effort basis, the link found next to the
    "点击" span is followed and the online bill is queried; values from
    that JSON answer replace the ones taken from the e-mail text.

    :param html_string: HTML source of the e-mail
    :param subject: e-mail subject (not used by this parser)
    :return: ``{'bill_info': {...}, 'bill_detail': []}``
    """
    soup = BeautifulSoup(html_string, "lxml")
    bill_info = {}
    account_text = soup.find("td").getText(strip=True)

    # (target key, pattern, clean-up applied to the first captured group)
    text_fields = (
        ('real_name', spdb_name_pattern, lambda s: s),
        ('repayment', spdb_repayment_pattern, lambda s: s.replace(",", "")),
        ('due_date', spdb_due_date_pattern, lambda s: s.replace("/", DATE_SEP)),
    )
    for key, pattern, clean in text_fields:
        found = pattern.search(account_text)
        if found:
            bill_info[key] = clean(found.group(1))

    try:
        bill_link = (soup.find('span', text='点击')
                     .findParent("table").find('a').get('href'))
        headers = SPDB_HEADERS.copy()
        landing = http_get(bill_link, headers=headers)
        # Reuse the cookie handed out by the landing page for the API call.
        headers['Cookie'] = landing.headers.get('Set-Cookie')
        api_url = 'https://ebill.spdbccc.com.cn/cloudbank-portal/myBillController/loadHomeData.action'
        online = json_loads(http_post(api_url, headers=headers).text)
        for key, source_key in (('card_num', 'cardNo'),
                                ('due_date', 'dueDate'),
                                ('repayment', 'stmtAmt'),
                                ('min_repayment', 'minPay'),
                                ('credit_limit', 'creditLimit'),
                                ('cash_limit', 'cashLimit'),
                                ('bill_date', 'closeDate')):
            bill_info[key] = online.get(source_key)
    except Exception:
        # Best effort only: keep whatever was parsed from the e-mail text.
        pass

    return {'bill_info': bill_info, 'bill_detail': []}
def parse_login(self, response):
    """
    Parse the login result and request the deposit information.

    The first entry of ``datalist`` carries the result flag ``jyjg``; any
    value other than ``"1"`` is reported through ``self.error_handle`` with
    the ``sbyy`` text.  On success the basic account fields are copied into
    the item and the deposit-information request is issued.

    :param response: login response
    :return: generator of requests / handler output
    """
    meta = response.meta
    item = meta["item"]
    try:
        account = json_loads(response.text)["datalist"][0]
        if account["jyjg"] != "1":
            reason = account["sbyy"]
            yield from self.error_handle(item["username"], reason,
                                         tell_msg=reason)
            return

        item["mobile"] = account.get("sjhm", "")
        item["private_no"] = account.get("gjjzh", "")
        item["real_name"] = account.get("xm", "")
        item["identification_number"] = account.get("zjh", "")
        item["identification_type"] = "身份证"

        # Next stage: the deposit-information endpoint.
        self.logger.info("请求缴存信息接口->%s" % self.userdeposit_url)
        form = {
            "gjjzh": item["private_no"],
            "zjh": item["identification_number"],
            "qqly": "1001",
            "key": self.key,
            "version": self.version,
        }
        yield FormRequest(url=self.userdeposit_url,
                          callback=self.parse_userdeposit,
                          headers=self.headers,
                          formdata=form,
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
    except Exception:
        yield from self.except_handle(item["username"],
                                      "广州公积金中心---登录数据解析异常")
def get_count_by_condition(self, court, date):
    """
    Return the number of results for a (court, judgement date) query.

    Retries forever: every attempt waits at least 0.5 s, a non-proxy error
    adds another 5 s pause, and after any failure a new proxy is taken from
    ``self.proxy_api``.

    :param court: court name used in the query parameter
    :param date: judgement date used in the query parameter
    :return: result count as ``int``
    """
    adapters.DEFAULT_RETRIES = 5
    # The form data does not change between attempts, so build it once.
    data = {
        "Param": "法院名称:%s,裁判日期:%s" % (court, date),
        "Index": "1",
        "Page": "5",
        "Order": "法院层级",
        "Direction": "asc",
    }
    while True:
        sleep(0.5)  # keep at least 0.5 s between two requests
        try:
            self.logger.info("current proxy->%s" % self.proxy)
            self.logger.info("get_count_by_condition->%s,%s" % (court, date))
            proxies = {
                "http": "http://%s" % self.proxy,
            }
            # A fresh session per attempt; the context manager closes it so
            # that failed attempts do not pile up open connections.
            with Session() as s:
                s.keep_alive = False
                r = s.post(self.count_url, data=data, headers=self.headers,
                           proxies=proxies, timeout=60)
                raw_text = r.text
            # The body is a JSON document wrapped in quotes with escaped
            # inner quotes; strip both before decoding.
            text = raw_text.replace("\\", "").strip("\"")
            self.logger.info("response text->%s" % raw_text)
            result = json_loads(text)
            count = result[0]["Count"]
            self.logger.info("get_count_by_condition,count->%s" % count)
            return int(count)
        except Exception as e:
            self.logger.info("get_count_by_condition,error->%s" % str(e))
            if not isinstance(e, ProxyError):
                # Not a proxy problem: pause 5 s before retrying.
                self.logger.info("sleep start!")
                sleep(5)
                self.logger.info("sleep end!")
            self.proxy = self.proxy_api.get_proxy_one()  # switch proxy
def parse_detail(self, response):
    """
    Parse one bill e-mail body and finish once all bills are collected.

    The record built by ``self.get_bill_record`` is appended to
    ``item['bill_records']``; when the list length equals ``meta['count']``
    the crawl is completed (with a logout request).  On any exception the
    item is yielded followed by the exception handler's output.
    """
    meta = response.meta
    item = meta['item']
    try:
        body = json_loads(response.text)['data']['body']
        record = self.get_bill_record(meta['bankname'], meta['subject'], body)
        collected = item['bill_records']
        collected.append(record)
        if len(collected) != meta['count']:
            # Other detail responses are still outstanding.
            return
        yield from self.crawling_done(
            item, logout_request=self.get_logout_request(meta))
    except Exception:
        yield item
        yield from self.except_handle(
            item['username'],
            msg="账单解析异常",
            tell_msg="账单解析异常",
            logout_request=self.get_logout_request(meta))
def parse_fee(self, response):
    """
    Parse the balance information and request the payment history.

    Return code ``000000`` stores the balance, ``570007`` (system busy)
    re-sends the request, ``500003`` (session empty) aborts with an
    authentication error, and anything else is only logged.  Unless the
    handler returned early, the payment-history request follows.
    """
    body = response.text
    meta = response.meta
    item = meta["item"]
    try:
        if '"retCode":"000000"' in body:
            fees = json_loads(body)["data"]
            cur_fee_total = float(fees.get("curFeeTotal", 0))
            owe_fee = float(fees.get("oweFee", 0))
            cur_fee = float(fees.get("curFee", cur_fee_total))
            # Balance is the smallest candidate; the negated arrears only
            # take part when the total is not positive.
            candidates = [cur_fee_total, cur_fee]
            if not cur_fee_total > 0:
                candidates.append(-owe_fee)
            item["balance"] = min(candidates)
            self.logger.critical("curFeeTotal(%s) oweFee(%s) curFee(%s)"
                                 % (cur_fee_total, owe_fee, cur_fee))
        elif '"retCode":"570007"' in body:
            # System busy: try again.
            yield from self._retry_request(response)
            return
        elif '"retCode":"500003"' in body:
            # Session information is empty: the user must log in again.
            # NOTE(review): the password is part of this message.
            yield item
            yield from self.error_handle(
                item["username"],
                "移动---获取余额信息失败: (username:%s, password:%s) %s"
                % (item["username"], item["password"], body),
                "认证失败,请刷新页面重试。",
                logout_request=self.get_logout_request(meta))
            return
        else:
            self.logger.error(
                "移动---获取余额信息失败:(username:%s, password:%s) %s"
                % (item["username"], item["password"], body))

        # Next stage: payment records.
        yield self._get_historyPayment_request(response)
    except Exception:
        yield item
        yield from self.except_handle(
            item["username"],
            "移动---解析余额信息失败: %s" % body,
            logout_request=self.get_logout_request(meta))
def _parse_qrcode_login_result(self, response):
    """
    Parse the result of a QR-code login.

    A result URL containing ``dangerousVerify`` starts the SMS
    verification flow; otherwise the login is treated as successful and
    the order request is issued.

    :return: generator of follow-up requests / handler output
    """
    username = response.meta["item"]["username"]
    try:
        redirect_url = json_loads(response.text).get("url", "")
        if "dangerousVerify" not in redirect_url:
            self.logger.info("扫描二维码登录成功")
            yield self._yield_order_request(response)
            return

        # The account is flagged as risky: an SMS check is required.
        self.logger.info("账户存在安全风险,需要短信验证")
        yield from self._dangerous_verify_scrapy(username, redirect_url,
                                                 response)
    except Exception:
        yield from self.except_handle(username, "解析二维码登录结果失败,登录失败")
def str_to_json(self, content, pattern=None, charset="utf-8"):
    """
    Cut a JSON fragment out of ``content`` and decode it.

    :param content: ``str`` or ``bytes`` to search; bytes are decoded with
        ``charset`` first
    :param pattern: compiled pattern whose first group is the JSON text;
        ``self.reg_tar_str`` is used when omitted
    :param charset: encoding used for ``bytes`` input
    :return: the decoded object, or ``None`` when the pattern does not
        match or anything raises
    """
    try:
        regex = self.reg_tar_str if pattern is None else pattern
        text = content.decode(charset) if isinstance(content, bytes) else content
        found = regex.search(text)
        return json_loads(found.group(1)) if found else None
    except Exception:
        return None
def parse_detail(self, response):
    """
    Parse one bill detail response and finish once all bills are in.

    The JSON ``data`` object supplies ``subject`` and ``display``, which
    are turned into a bill record and appended to
    ``item['bill_records']``.  The crawl is completed (with a logout
    request) as soon as the list length equals ``meta['count']``.
    """
    meta = response.meta
    item = meta['item']
    try:
        payload = json_loads(response.text)['data']
        record = self.get_bill_record(meta['bankname'],
                                      payload['subject'],
                                      payload['display'])
        collected = item['bill_records']
        collected.append(record)
        if len(collected) != meta['count']:
            # More detail responses are still expected.
            return
        # Every expected bill has been handled: finish and log out.
        yield from self.crawling_done(
            item, logout_request=self.get_logout_request(meta))
    except Exception:
        yield from self.except_handle(
            item['username'],
            "账单解析异常",
            tell_msg="账单解析异常",
            logout_request=self.get_logout_request(meta))
def parse(self, response):
    """
    Entry point of the QR-code mail login.

    ``item['password']`` is expected to hold ``"<result url>|<cookies as
    JSON>"``.  The field is blanked, and the result URL is requested with
    the decoded cookies; any failure is reported as a login failure.
    """
    meta = response.meta
    item = meta['item']
    try:
        parts = item['password'].split("|", 1)
        # The field only transported the login data; clear it immediately.
        item['password'] = ''
        qr_result_url, cookies_json = parts
        cookies = json_loads(cookies_json)
        meta['retry_time'] = 0
        yield Request(qr_result_url,
                      callback=self.check_sig,
                      meta=meta,
                      cookies=cookies,
                      dont_filter=True,
                      errback=self.err_callback)
    except Exception:
        yield from self.except_handle(item['username'], '登录入口解析失败',
                                      tell_msg="邮箱登录失败,请刷新页面重试")
def do_img_upload():
    """
    Log in to the Yinding site, run the image upload and log out again.

    The login is a two-step exchange (user validation, then system login).
    A non-200 status, a non-"SUCCESS" validation result or a landing page
    without the logout link raises ``LoginFailedException``.  The logout
    request is always sent, whatever happened before.
    """
    logout_url = YINDING_HOST + "login/initLogin.loginOut.do"
    with Session() as _session:
        try:
            _session.headers.update({
                "Referer": YINDING_HOST + "login/initLogin.goInit.do",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            })
            # Drop any session left over on the server side first.
            _session.post(logout_url)

            # Step 1: validate the user.
            credentials = {
                "userInfo.userName": "******",
                "userInfo.password": "******",
            }
            validation = _session.post(
                YINDING_HOST + "login/initLogin.validationUser.do",
                credentials, **KWARGS)
            if validation.status_code != 200:
                raise LoginFailedException("验证用户返回非200")
            if json_loads(validation.text)["resultType"] != "SUCCESS":
                raise LoginFailedException(validation.text)

            # Step 2: the actual login.
            login_form = {
                "userName": "******",
                "password": "******",
            }
            landing = _session.post(
                YINDING_HOST + "login/initLogin.loginSystem.do",
                login_form, **KWARGS)
            if landing.status_code != 200:
                raise LoginFailedException("登录返回非200")
            if "退出登录" not in landing.text:
                raise LoginFailedException("登录失败")

            _do_img_upload(_session)
        finally:
            # Always log out.
            _session.post(logout_url)
def parse_queryHistoryBill(self, response):
    """
    Parse the bill history and continue with ``self.casual_request``.

    Return code ``000000`` stores one ``{"all_fee": <float>}`` entry per
    ``billMonth`` in ``item["history_bill"]``, ``570007`` (system busy)
    re-sends the request, ``500003`` (session empty) aborts with an
    authentication error, and anything else is only logged.
    """
    body = response.text
    meta = response.meta
    item = meta["item"]
    try:
        if '"retCode":"000000"' in body:
            # One response already carries all returned months.
            bills = {}
            for bill in json_loads(body)["data"]:
                bills[bill["billMonth"]] = {"all_fee": float(bill["billFee"])}
            item["history_bill"] = bills
        elif '"retCode":"570007"' in body:
            # System busy: try again.
            yield from self._retry_request(response)
            return
        elif '"retCode":"500003"' in body:
            # Session information is empty: the user must log in again.
            # NOTE(review): the password is part of this message.
            yield item
            yield from self.error_handle(
                item["username"],
                "移动---获取历史账单失败: (username:%s, password:%s) %s"
                % (item["username"], item["password"], body),
                "认证失败,请刷新页面重试。",
                logout_request=self.get_logout_request(meta))
            return
        else:
            self.logger.error(
                "移动---获取历史账单失败:(username:%s, password:%s) %s"
                % (item["username"], item["password"], body))

        meta["sendSMSpwd_count"] = 1
        yield self.casual_request(response)
    except Exception:
        yield item
        yield from self.except_handle(
            item["username"],
            "移动---解析历史账单失败: %s" % body,
            logout_request=self.get_logout_request(meta))
def parse_payment_list(self, response):
    """
    Parse the payment records and finish the crawl.

    On result code ``0000`` every payment is filed under its billing cycle
    in a dictionary pre-filled with the months returned by
    ``get_months_str_by_number(6)``, stored as ``item["history_payment"]``,
    and the crawl is completed.  Any other body is reported through
    ``self.error_handle``.
    """
    body = response.text
    meta = response.meta
    item = meta["item"]
    username = item["username"]
    try:
        if '"resultCode":"0000"' not in body:
            # NOTE(review): the password is part of this message.
            yield item
            tell_msg = self.get_err_msg(body)
            yield from self.error_handle(
                username,
                "电信---获取缴费记录失败:(username:%s, password:%s) %s"
                % (username, item["password"], body),
                tell_msg=tell_msg)
            return

        self.logger.info("[电信-" + username + "]: 获取缴费记录成功!")
        # One response already carries the payments of all months.
        cycles = json_loads(body)["responseData"]["data"]["billingCycles"]
        by_month = {}
        for month in get_months_str_by_number(6).split(","):
            by_month[month] = []

        for cycle in cycles:
            details = cycle["paymentDetails"]
            if not details:
                continue
            for payment in details:
                paid_on = payment["stateDate"]
                # stateDate is sliced as YYYY, MM and the remainder.
                day = "-".join((paid_on[0:4], paid_on[4:6], paid_on[6:]))
                by_month[cycle["billCycle"]].append({
                    "time": day + " 00:00:00",
                    "channel": payment["payChannelId"],
                    "fee": payment["paymentAmount"],
                })
        item["history_payment"] = by_month

        # Last stage: hand the finished item over.
        yield from self.crawling_done(item)
    except Exception:
        yield item
        yield from self.except_handle(username,
                                      "电信---解析缴费记录失败: %s" % body)
def parse_userview(self, response):
    """
    Parse the deposit detail list and request the loan information.

    Every ``datalist`` entry is reduced to ``depmny`` / ``acctime`` /
    ``bustype`` (taken from ``fse`` / ``ywrq`` / ``ywlxms``) and the result
    is stored in ``item["payment_detail"]``.

    :param response: deposit-detail response
    :return: generator of the loan-information request or handler output
    """
    meta = response.meta
    item = meta["item"]
    try:
        payload = json_loads(response.text)
        item["payment_detail"] = [
            {
                "depmny": row["fse"],
                "acctime": row["ywrq"],
                "bustype": row["ywlxms"],
            }
            for row in payload.get("datalist", [])
        ]

        # Next stage: the loan-information endpoint.
        self.logger.info("请求贷款信息接口->%s" % self.userloan_url)
        form = {
            "zjh": item["identification_number"],
            "qqly": "1001",
            "key": self.key,
            "version": self.version,
        }
        yield FormRequest(url=self.userloan_url,
                          callback=self.parse_userloan,
                          headers=self.headers,
                          formdata=form,
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
    except Exception:
        yield from self.except_handle(item["username"],
                                      "广州公积金中心---缴存明细数据解析异常")
def xuexin_get_vcode(request):
    """
    Ask the CHSI account service to send a phone verification code.

    Posts ``captch``, ``mobilePhone``, ``optType=REGISTER`` and
    ``ignoremphone=true`` to
    https://account.chsi.com.cn/account/getmphonpincode.action using the
    cookies kept in the session under ``req_cookie``.

    :param request: request whose POST data carries ``mphone`` and
        ``captch``
    :return: JsonResponse whose ``result`` is the decoded service answer
    """
    form = request.POST
    mphone = form.get("mphone", "")
    captch = form.get("captch", "")
    payload = {
        "captch": captch,
        "mobilePhone": mphone,
        "optType": "REGISTER",
        "ignoremphone": "true",
    }
    upstream = http_post(
        "https://account.chsi.com.cn/account/getmphonpincode.action",
        data=payload,
        headers=HEADERS,
        verify=False,
        cookies=request.session.get("req_cookie"))

    # The answer uses single quotes; switch to double quotes and strip
    # whatever vcode_pattern matches before decoding it as JSON.
    answer = vcode_pattern.sub('', upstream.text.replace("'", "\""))
    result = {"result": json_loads(answer)}
    add_ajax_ok_json(result)
    return JsonResponse(result)
def parse(self, response):
    """
    Handle one geocoding answer and schedule the next address.

    Status 0 yields an item with the coordinates.  Status 4 or >= 300 marks
    the API key as unusable and re-queues the address.  Any other status,
    or any exception, only re-queues the address.  Finally the next request
    from ``self.get_request()`` is yielded, if there is one.
    """
    record_id = response.meta["id"]
    address = response.meta["address"]
    queue_entry = "%s|$|%s" % (record_id, address)
    ak = response.meta["ak"]
    try:
        answer = json_loads(response.text)
        status = answer["status"]
        if status == 0:
            location = answer["result"]["location"]
            item = GpsspgItem()
            item["id"] = record_id
            item["address"] = address
            item["lng"] = location["lng"]  # longitude
            item["lat"] = location["lat"]  # latitude
            # Handed on to the item pipeline.
            yield item
        elif status == 4 or status >= 300:
            self.logger.error("ak:%s, 当日请求超出配额,%s,%s" % (ak, address, record_id))
            # The key ran out of quota for today: mark it unusable and put
            # the address back into the queue.
            self.set_ak(ak_key=ak, status=False)
            self.push_address_queue(queue_entry)
        else:
            self.logger.error("ak:%s, 请求出错,状态码:%s,%s,%s" % (ak, status, address, record_id))
            self.push_address_queue(queue_entry)
    except Exception as e:
        self.logger.error("ak:%s, 处理出错,出错信息:%s,%s,%s" % (ak, str(e), address, record_id))
        self.push_address_queue(queue_entry)

    next_request = self.get_request()
    if next_request:
        yield next_request
def load_and_store_train_data(captcha_store_directory, train_data_path, loop_count=1):
    """
    Collect JD login captchas and label them by hand.

    For each round a captcha image is downloaded; when its KNN distance
    (formatted to two decimals) is not yet a key of the training data, the
    image is saved and shown, the label is read from standard input and
    the entry is added.  The training data is written back afterwards.

    :param captcha_store_directory: directory receiving ``captcha_<n>.jpg``
    :param train_data_path: JSON file holding the training data (read
        first, rewritten at the end)
    :param loop_count: number of captchas to download
    """
    with open(train_data_path, "r") as train_data_file:
        result = json_loads(train_data_file.read())

    size = len(result) + 1
    try:
        for _ in range(loop_count):
            resp = http_get("https://passport.jd.com/new/login.aspx")
            soup = BeautifulSoup(resp.text, "html.parser")
            auth_code_input = soup.select_one("#JD_Verification1")
            # The image address is a protocol-relative URL in "src2".
            image_url = auth_code_input["src2"]
            resp = http_get("https:%s" % image_url)
            with Image.open(BytesIO(resp.content)) as image:
                knn_distance = get_knn_distance(image)
                knn_distance = str(float('%.2f' % knn_distance))
                if knn_distance in result:
                    continue
                file_name = "captcha_%d.jpg" % size
                image.save("%s/%s" % (captcha_store_directory, file_name))
                image.show()
                # Ask the operator for the captcha text.
                label = input("请输入验证码:").strip()
                result[knn_distance] = (knn_distance, label, file_name)
                size += 1
    finally:
        # Persist what has been labelled so far even if a later round
        # failed; serialise first so a failure cannot truncate the file.
        serialized = json_dumps(result)
        with open(train_data_path, "w") as train_data_file:
            train_data_file.write(serialized)


if __name__ == "__main__":
    # Raw strings: the previous literals relied on invalid escape sequences
    # such as "\w" and "\s"; the resulting paths are unchanged.
    load_and_store_train_data(
        r"F:\work\公司文档\爬虫\京东验证码",
        r"F:\software\pycharm\workspace\crawler\crawler_bqjr\crawler_bqjr\spiders\b2c_ecommerce_spiders\train_data.json",
        20)
def parse(self, response):
    """
    Build the list of region query conditions and send the first query.

    The region table is cut out of the page with ``self.area_regex``,
    rewritten into JSON and decoded.  One condition is appended to
    ``self.query_condition`` per city without areas and per area
    otherwise; then the last appended condition is popped and posted.
    """
    raw = self.area_regex.search(response.text).group(1)
    raw = raw.replace("//", "#").replace(";", "")
    raw = self.replace_pattern.sub("", raw)
    # Quote the bare object keys so that the text becomes JSON.
    for bare_key in ('name', 'cityList', 'areaList'):
        raw = raw.replace(bare_key, '"%s"' % bare_key)
    provinces = json_loads(raw)

    pending = self.query_condition
    for province_entry in reversed(provinces):
        province = province_entry["name"]
        for city_entry in reversed(province_entry["cityList"]):
            base = {
                "province": province,
                "city": city_entry["name"],
                "area": "",
            }
            if len(city_entry["areaList"]) == 0:
                pending.append(base)
                continue
            for area in reversed(city_entry["areaList"]):
                pending.append(dict(base, area=area))

    # Query by the combined region condition.
    query = pending.pop()
    for field in ("province", "city", "area"):
        self.post_data[field] = query[field]
    self.current_condition = query
    yield FormRequest(self.post_url, self.parse_detail,
                      formdata=self.post_data)
def parse_detail(self, response):
    """
    Parse one region query result and post the next region query.

    On "成功" every not-yet-recorded (name + card number) entry is
    recorded and yielded as an item.  When the daily query limit is hit
    the current condition is put back and the spider sleeps until
    tomorrow.  Afterwards the next pending condition, if any, is posted.
    """
    result = json_loads(response.text)
    outcome = result["ErrorMsg"]
    if outcome == "成功":
        for entry in result["data"]["areaList"]:
            name = entry["name"]
            card_num = entry["cardNum"]
            dedup_key = name + card_num
            if self.is_search_name_exists(dedup_key):
                continue
            self.record_search_name(dedup_key)
            item = ShixinListItem()
            item["from_web"] = "kuaicha"
            item["name"] = name
            item["id"] = card_num
            yield item
    elif outcome == "已达到每日查询次数上限":
        # Re-queue the condition that was just refused, then wait.
        self.query_condition.append(self.current_condition)
        self.logger.info("今日查询次数达到上限,明日再查")
        sleep_to_tomorrow()

    if not self.query_condition:
        self.logger.info("所有条件查询结束")
        return

    query = self.query_condition.pop()
    for field in ("province", "city", "area"):
        self.post_data[field] = query[field]
    self.current_condition = query
    yield FormRequest(self.post_url, self.parse_detail,
                      formdata=self.post_data)
def _parse_order_info(self, response):
    """
    Parse one page of the order list.

    Every main order is reduced to a dictionary (basic fields, sub-orders
    and logistics fetched through ``self._get_logistics_info``) and added
    to ``item["orders"]``.  Then either the next page is requested or the
    crawl is finished.

    :param response: order-list response
    :return: generator of requests, the item and handler output
    """
    meta = response.meta
    item = meta["item"]
    orders = item.get("orders") or []
    try:
        json_page = json_loads(response.text)
        if json_page.get("error"):
            yield item
            yield from self.crawling_failed(item["username"], "请求订单信息失败")
            return

        page = json_page.get("page")
        curr_page = page.get("currentPage")
        total_page = page.get("totalPage")

        for main_order in json_page.get("mainOrders", []):
            order_info = main_order.get("orderInfo", {})
            pay_info = main_order.get("payInfo", {})
            order = {
                "b2c": order_info.get("b2C"),
                "order_no": order_info.get("id"),
                "create_time": order_info.get("createTime"),
                "order_fee": pay_info.get("actualFee"),
                "post_type": pay_info.get("postType", ""),
                "shop_name": main_order.get("seller", {}).get("shopName"),
                "status": main_order.get("statusInfo", {}).get("text"),
            }
            # Sub-orders: product title and quantity only.
            order["sub_orders"] = [
                {
                    "goods_name": sub.get("itemInfo", {}).get("title"),
                    "quantity": sub.get("quantity"),
                }
                for sub in main_order.get("subOrders", [])
            ]
            # Logistics information of the order.
            detail_url = "https://tradearchive.taobao.com/trade/detail/" \
                         "trade_item_detail.htm?bizOrderId={id}".format(id=order.get("order_no"))
            order["logistics"] = self._get_logistics_info(
                detail_url, cookies=meta["cookies"])
            orders.append(order)
        item["orders"] = orders

        # NOTE(review): crawling only stops once currentPage equals
        # totalPage + 1, so one page past totalPage is requested -- confirm
        # this is intended and not an off-by-one.
        current_no = int(curr_page)
        if current_no == int(total_page) + 1:
            self.logger.info("---->所有信息抓取完成")
            yield from self.crawling_done(item)
            return

        query_req = self.__generate_order_request(
            meta, page_num=int(curr_page) + 1)
        if query_req:
            yield query_req
        else:
            yield item
            yield from self.crawling_failed(
                item["username"], "生成请求订单信息失败")
    except Exception:
        yield item
        yield from self.except_handle(item["username"], "解析订单信息异常")
def _get_baidu_api_ret(url):
    """
    Call a Baidu API endpoint and return the decoded JSON answer.

    :param url: complete request URL
    :return: object decoded from the response body
    """
    req = Request(url)
    # NOTE(review): hard-coded API key -- consider moving it to
    # configuration.
    req.add_header('apikey', '72a0d84c0c95a75cd5aa8c1c3e698946')
    # The context manager closes the HTTP response (and its socket) even if
    # reading or decoding fails.
    with urlopen(req, timeout=ASK_TIMEOUT) as resp:
        return json_loads(resp.read())
def parse_payment(self, response):
    """
    Parse the payment records of one month and schedule what follows.

    A body with ``"pageMap":`` stores the month's payments in
    ``item["history_payment"]``; the "no record" response code stores an
    empty list.  An unrecognised body is re-sent once and only logged the
    second time.  Afterwards either the previous month is requested or,
    once ``meta["payment_count"]`` reaches ``self.CALL_COUNT_LIMIT``, the
    crawl is finished.
    """
    body = response.text
    meta = response.meta
    item = meta["item"]
    try:
        month = meta["last_payment_month"]
        month_key = month.strftime("%Y%m")
        retry_flag = "payment_" + month_key + "_retry"

        if '"pageMap":' in body:
            page = json_loads(body)["pageMap"]
            reported = page.get("totalCount", 0)
            # NOTE(review): the log messages in this method contain the
            # account password in plain text.
            if reported > self.CALL_PAGE_SIZE_LIMIT:
                self.logger.error("联通---单月交费次数过大:username:%s, "
                                  "password:%s, month:%s, totalCount:%d"
                                  % (item["username"], item["password"],
                                     month, reported))
            payments = []
            for payment in page["result"]:
                payments.append({
                    "time": payment["paydate"],
                    "fee": float(payment["payfee"]),
                    "channel": payment["payment"],
                })
            if len(payments) != reported:
                # The page does not hold as many rows as announced.
                self.logger.error(
                    "联通---交费次数异常:username:%s, password:%s, "
                    "month:%s, totalCount:%d, payment_list_len:%d"
                    % (item["username"], item["password"], month,
                       reported, len(payments)))
            item["history_payment"][month_key] = payments
        elif 'respCode":"2114000283"' in body:
            # No payment records for this month.
            item["history_payment"][month_key] = []
        elif retry_flag not in meta:
            # First failure for this month: re-send the request once.
            self.logger.error("联通---重试交费记录:(username:%s, password:%s) %s"
                              % (item["username"], item["password"], body))
            sleep(0.6)
            retry_request = response.request.copy()
            retry_request.meta[retry_flag] = 1
            yield retry_request
            return
        else:
            self.logger.error(
                "联通---获取交费记录失败:(username:%s, password:%s, month:%s) %s"
                % (item["username"], item["password"], month, body))

        if meta["payment_count"] < self.CALL_COUNT_LIMIT:
            # More months are still wanted: go one month further back.
            yield self._get_payment_request(
                response, get_last_month_from_date(month))
            return

        # All stages are done: hand the finished item over.
        yield from self.crawling_done(item)
    except Exception:
        yield item
        yield from self.except_handle(item["username"],
                                      "联通---解析交费记录失败: %s" % body)
def parse_msgDetail(self, response):
    """
    Parse the SMS records of one month and schedule what follows.

    A body with ``"pageMap":`` stores the month's messages in
    ``item["history_msg"]``; the "no record" response code stores an empty
    list.  An unrecognised body is re-sent once and only logged the second
    time.  Afterwards either the previous month is requested or, once
    ``meta["msg_count"]`` reaches ``self.CALL_COUNT_LIMIT``, bill crawling
    is started.
    """
    body = response.text
    meta = response.meta
    item = meta["item"]
    try:
        month = meta["last_msg_month"]
        month_key = month.strftime("%Y%m")
        retry_flag = "msg_" + month_key + "_retry"

        if '"pageMap":' in body:
            page = json_loads(body)["pageMap"]
            reported = page.get("totalCount", 0)
            # NOTE(review): the log messages in this method contain the
            # account password in plain text.
            if reported > self.CALL_PAGE_SIZE_LIMIT:
                self.logger.error("联通---单月短信次数过大:username:%s, "
                                  "password:%s, month:%s, totalCount:%d"
                                  % (item["username"], item["password"],
                                     month, reported))
            messages = []
            for msg in page["result"]:
                messages.append({
                    "time": msg["smsdate"] + " " + msg["smstime"],
                    # smstype "2" is treated as a sent message.
                    "type": MsgType.Send if "2" == msg["smstype"] else MsgType.Receive,
                    "other_num": msg["othernum"],
                })
            if len(messages) != reported:
                # The page does not hold as many rows as announced.
                self.logger.error(
                    "联通---短信次数异常:username:%s, password:%s, "
                    "month:%s, totalCount:%d, msg_list_len:%d"
                    % (item["username"], item["password"], month,
                       reported, len(messages)))
            item["history_msg"][month_key] = messages
        elif 'respCode":"2114030170"' in body:
            # No SMS records for this month.
            item["history_msg"][month_key] = []
        elif retry_flag not in meta:
            # First failure for this month: re-send the request once.
            self.logger.error("联通---重试短信记录:(username:%s, password:%s) %s"
                              % (item["username"], item["password"], body))
            sleep(0.6)
            retry_request = response.request.copy()
            retry_request.meta[retry_flag] = 1
            yield retry_request
            return
        else:
            self.logger.error(
                "联通---获取短信记录失败:(username:%s, password:%s, month:%s) %s"
                % (item["username"], item["password"], month, body))

        if meta["msg_count"] < self.CALL_COUNT_LIMIT:
            # More months are still wanted: go one month further back.
            yield self._get_msgDetail_request(
                response, get_last_month_from_date(month))
            return

        # Next stage: the bills, starting from the current month.
        meta["bill_count"] = 0
        item["history_bill"] = defaultdict(dict)
        yield self._get_historyBill_request(response, date.today())
    except Exception:
        yield item
        yield from self.except_handle(item["username"],
                                      "联通---解析短信记录失败: %s" % body)