def parse_person_info_three(self, response):
        """
        Parse the third personal-information response.

        On a successful result code the item receives ``real_name`` and
        ``status`` and a request for the current month's bill is yielded.
        Otherwise the item is yielded as-is and the failure is reported through
        ``error_handle``; any exception is reported through ``except_handle``.

        Note (from the original author): the request parameters built for this
        call differ from the app's once encrypted, and no difference could be
        found by comparison; the encrypt/decrypt helpers possibly affect how
        the parameters are displayed.
        """
        meta = response.meta
        item = meta["item"]
        username = item["username"]

        text = ""
        try:
            res_dict = self.dx_conver.convert_response_data(response.text)
            text = json_dumps(res_dict, ensure_ascii=False)

            # Failure first: hand back what we have and report the error.
            if '"ResultCode":{"value":"0000"}' not in text:
                yield item
                tell_msg = self.get_err_msg(text, True)
                log_msg = ("电信---获取个人信息3失败:(username:%s, password:%s) %s"
                           % (username, item["password"], text))
                yield from self.error_handle(username, log_msg, tell_msg=tell_msg)
                return

            self.logger.info("[电信-" + username + "]: 获取个人信息3成功!")

            datas = res_dict["Response"]["ResponseData"]["Data"]
            item["real_name"] = datas["Cust_Name"]["value"]
            # NOTE(review): Cust_Name is read through ["value"] but NumberStatus
            # is compared directly -- confirm the response schema.
            if datas["NumberStatus"] == "100000":
                item["status"] = UserStatus.Opened
            else:
                item["status"] = UserStatus.Shutdown

            content = {
                "fieldData": {
                    "accnbr": username,
                    "queryflag": "0",
                    "queryType": "0",
                },
                "attach": "test"
            }
            header_infos = {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryThisMonthBill",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
            }
            form_data = {"content": content, "headerInfos": header_infos}
            yield Request("https://appgo.189.cn:8443/query/bill/queryThisMonthBill",
                          self.parse_this_month_bill,
                          "POST",
                          self.appgo_headers,
                          json_dumps(form_data),
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        except Exception:
            yield item
            yield from self.except_handle(username, "电信---解析个人信息3失败: %s" % text)
Beispiel #2
0
 def exception_handle(self, condition, error_info):
     """
     Re-queue a failed query condition and retry the list request via a new proxy.

     Nothing is done when ``self.name`` is ``"condition_spider"``.

     :param condition: query condition pushed back with ``push_wenshu_condition``
     :param error_info: description of the error; only logged
     :return: generator yielding at most one retry ``Request``
     """
     try:
         if self.name != "condition_spider":
             # On any failure, put the failing query condition back on the queue.
             self.push_wenshu_condition(condition)
             self.logger.info("parse or parse_doc error->%s" %
                              str(error_info))
             # Back off before retrying; the server may be blocking us.
             self.logger.info("sleep start!")
             sleep(5)  # pause for 5 seconds
             self.logger.info("sleep end!")
             # Switch to a fresh proxy.
             self.proxy = self.proxy_api.get_proxy_one()
             self.logger.error("request retry")
             # Re-issue the list request for the current condition.
             request = Request(url=self.list_url,
                               method='POST',
                               callback=self.parse,
                               body=json_dumps(self.req_data),
                               headers=self.headers,
                               dont_filter=True,
                               errback=self.err_callback)
             self.set_proxy(request)
             yield request
     except Exception:
         # The previous code called self.exception_handle(...) here. Because
         # this method is a generator, that call only created a generator
         # object and discarded it, so the failure vanished without a trace.
         # Log it instead of recursing, which could loop without bound.
         self.logger.exception("change proxy error!")
Beispiel #3
0
 def push_query_condition_queue(self, condition):
     """
     Best-effort push of a query condition onto the SSDB condition queue.

     The condition is passed through ``self.dict_sorted``, serialised to JSON
     with non-ASCII characters kept as-is, and handed to
     ``self.ssdb_conn.qpush_back``. Any exception is swallowed.

     :param condition: query condition to enqueue
     :return: None
     """
     try:
         serialized = json_dumps(self.dict_sorted(condition), ensure_ascii=False)
         self.ssdb_conn.qpush_back(SSDB_WENSHU_CONDITION_QUEUE, serialized)
     except Exception:
         pass
     return None
 def _set_sms_captcha_headers_to_ssdb(self, username, token):
     """
     将当前的token放入ssdb中
     :param username: 用户名
     :param token   : 用户唯一凭证
     :return        : None
     """
     json_str = json_dumps({"token": token})
     self.set_sms_captcha_headers_to_ssdb(json_str, username)
def ask_qrcode_status(request):
    """
    Query the scan status of a Taobao login QR code.

    Reads ``username``, ``account_type`` and the optional ``lg_token`` from
    ``request.POST``, asks the Taobao check endpoint for the QR-code state and
    maps the returned ``code`` to a message. When the login is confirmed
    (``"10006"``) the redirect URL is followed and, on HTTP 200, the
    ``.taobao.com`` cookies are stored in SSDB for the crawler side.

    :param request: incoming request carrying the POST arguments
    :return: ``JsonResponse`` -- an "ok" payload on success, otherwise an error
             payload that also carries ``need_refresh``
    """
    ret_data = {}
    succ = False
    need_refresh = False
    try:
        post_args = request.POST
        username = post_args["username"]
        account_type = post_args["account_type"]
        lg_token = post_args.get("lg_token", "")
        check_url = ("https://qrlogin.taobao.com/qrcodelogin/qrcodeLoginCheck.do?"
                     "lgToken={lgToken}&defaulturl=https%3A%2F%2Fwww.taobao.com%2F"
                     ).format(lgToken=lg_token)
        status = get_response_by_requests(check_url, headers=DEFAULT_HEADERS).json()

        session = req_session()
        msg = "通过扫描二维码登录失败"
        code = status.get("code")
        if code == "10006":
            # Login confirmed: follow the redirect to collect the cookies.
            resp = session.get(status.get("url"), headers=DEFAULT_HEADERS, verify=False)
            if resp.status_code == 200:
                msg = "登录成功"
                cookies_str = json_dumps(session.cookies.get_dict(domain='.taobao.com'))
                # Persist the logged-in cookies in SSDB for the crawler side.
                ssdb_connect = get_ssdb_conn()
                key = username + ACCOUNT_CRAWLING_QRCODE_COOKIES_SSDB_SUFFIX + account_type
                ssdb_connect.setx(key, cookies_str, DATA_EXPIRE_TIME)
                succ = True
        elif code == "10000":
            msg = "请先扫描二维码"
        elif code == "10001":
            msg = "扫描成功后,请确认登录"
            succ = True
        elif code == "10004":
            msg = "二维码已失效,请重试"
            need_refresh = True
        else:
            msg = status.get("msg", "通过扫描二维码登录失败")
    except Exception:
        msg = "获取扫描二维码状态出错"

    if not succ:
        ret_data["need_refresh"] = need_refresh
        add_ajax_error_json(ret_data, msg)
    else:
        add_ajax_ok_json(ret_data)

    return JsonResponse(ret_data)
def _send_taobao_alicloudapi(url, b64_pic):
    """
    POST a base64 image to an Alibaba Cloud OCR endpoint and return the words.

    An APPCODE is picked at random from ``ALICLOUDAPI_APPCODE_LIST`` for the
    ``Authorization`` header.

    :param url: endpoint to post to
    :param b64_pic: image content sent as the ``img`` field
    :return: list of the ``word`` values found under ``prism_wordsInfo``, or
             ``None`` when the response has no such key
    """
    payload = {'img': b64_pic, 'prob': 'false'}
    auth_headers = {'Authorization': 'APPCODE ' + choice(ALICLOUDAPI_APPCODE_LIST)}

    response = http_post(url, data=json_dumps(payload), headers=auth_headers)
    parsed = json_loads(response.text)
    if "prism_wordsInfo" not in parsed:
        return None
    return [entry["word"] for entry in parsed["prism_wordsInfo"]]
Beispiel #7
0
 def record_query_condition(self, condition, status=0):
     """
     Best-effort insert of a query condition and its status into MongoDB.

     The condition is passed through ``self.dict_sorted`` and serialised to
     JSON (non-ASCII kept as-is) before being stored. Any exception is
     swallowed.

     :param condition: query condition to record
     :param status: status value stored next to the condition (default 0)
     :return: None
     """
     try:
         serialized = json_dumps(self.dict_sorted(condition), ensure_ascii=False)
         self.mongo_instance.insertOne({"condition": serialized, "status": status})
     except Exception:
         pass
Beispiel #8
0
    def _dangerous_verify_scrapy(self, username, verify_url, response):
        """
        登录安全校验(先发送短信验证码,再提交校验)
        :param username:
        :param verify_url:
        :param response:
        :return:
        """
        try:
            headers_data = json_dumps({"url": verify_url})
            self.set_sms_captcha_headers_to_ssdb(headers=headers_data,
                                                 username=username)
            self.logger.info("等待获取用户输入短信验证码中...")
            sms_code_data = self.ask_sms_captcha(username)

            if not sms_code_data:
                msg = "获取用户输入短信验证码失败,登录失败"
                yield from self.error_handle(username, msg)
            else:
                tmp_data = sms_code_data.split("_")
                sms_code, ret_key = tmp_data if len(tmp_data) == 2 else ("",
                                                                         "")
                self.logger.info("%s ---> sms_code:%s" % (username, sms_code))
                # 获取用户指纹信息eid,fp暂未实现,获取方式:https://payrisk.jd.com/js/td.js
                eid, fp = ("", "")
                valid_url = "https://safe.jd.com/dangerousVerify/checkDownLinkCode.action" \
                            "?code={code}&k={k}&t={stime}&eid={eid}" \
                            "&fp={fp}".format(code=sms_code, k=ret_key, stime=get_js_time(), eid=eid, fp=fp)
                my_headers = self.headers.copy()
                my_headers.update({
                    "Accept":
                    "application/json, text/javascript, */*; q=0.01",
                    "Referer":
                    verify_url,
                    "Host":
                    "safe.jd.com",
                    "Connection":
                    "keep-alive",
                    "X-Requested-With":
                    "XMLHttpRequest",
                    "User-Agent":
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
                })
                yield Request(url=valid_url,
                              headers=my_headers,
                              callback=self._parse_verify_result,
                              meta=response.meta,
                              dont_filter=True,
                              errback=self.err_callback)
        except CaptchaTimeout:
            yield from self.error_handle(username, "获取短信验证码超时,登录失败")
        except Exception:
            msg = "进行安全校验出错:%s" % username
            yield from self.except_handle(username, msg)
Beispiel #9
0
 def rabbitmq_sender(self, queue, item_dict):
     """
     Compress and base64-encode an item, then publish it to RabbitMQ.

     The item is serialised to JSON, UTF-8 encoded, passed through
     ``compress`` and ``b64encode``, and sent as text through a durable
     ``RabbitmqSender`` bound to ``RABBITMQ_EXCHANGE``.

     :param queue: queue name
     :param item_dict: item to send; its ``_id`` key, if present, is removed
                       from the dict in place before serialisation
     :return: None
     """
     # Tolerate items without "_id" instead of failing with KeyError.
     # NOTE(review): "_id" is presumably the MongoDB ObjectId, which JSON
     # cannot serialise -- confirm against the callers.
     item_dict.pop("_id", None)
     payload = json_dumps(item_dict).encode("utf-8")
     content = b64encode(compress(payload)).decode("utf-8")
     with RabbitmqSender(queue=queue,
                         exchange=RABBITMQ_EXCHANGE,
                         durable=True) as rs:
         rs.send(content)
def _get_telecom_bills_sms_captcha(args):
    """
    Ask the telecom service to send a regular SMS verification code.

    The login token is read from SSDB under a key built from the username,
    ``ACCOUNT_CRAWLING_SMS_HEADERS_SSDB_SUFFIX`` and the account type. With it
    a ``getRandomV2`` request is converted by ``DXConvertData`` and posted.

    :param args: mapping providing ``username`` and ``account_type``
    :return: ``JsonResponse`` with an ok payload, or an error payload and message
    """
    ret_data = {}
    username = args["username"].strip()
    dx_conver = DXConvertData()

    url = "http://cservice.client.189.cn:8004/map/clientXML?encrypted=true"
    key = username + ACCOUNT_CRAWLING_SMS_HEADERS_SSDB_SUFFIX + args["account_type"]
    try:
        ssdb_conn = get_ssdb_conn()
        headers = ssdb_conn.get(key)
        if headers is None:
            add_ajax_error_json(ret_data, "无法获取短信验证码,请刷新页面重试!")
        else:
            token = json_loads(headers)["token"]
            header_infos = {
                "ClientType": "#6.2.1#channel8#Huawei DUK-AL20#",
                "Source": "110003",
                "SourcePassword": "******",
                "Token": token,
                "UserLoginName": username,
                "Code": "getRandomV2",
                "Timestamp": strftime("%Y%m%d%H%M%S"),
            }
            content = {
                "Attach": "test",
                "FieldData": {
                    "PhoneNbr": username,
                    "SceneType": "7",
                    "Imsi": {}
                }
            }
            form_data = {"Request": {"HeaderInfos": header_infos, "Content": content}}
            form_str = dx_conver.convert_request_data(form_data)
            sms_text = http_post(url, headers=CSERVICE_HEADERS, data=form_str, verify=False).text

            sms_dict = dx_conver.convert_response_data(sms_text)
            sms_str = json_dumps(sms_dict, ensure_ascii=False)
            if '"ResultCode":{"value":"0000"}' in sms_str:
                add_ajax_ok_json(ret_data)
            elif "服务中断" in sms_text:
                # Checked against the raw response text, not the converted one.
                add_ajax_error_json(ret_data, "电信服务中断,请稍后再试!")
            else:
                add_ajax_error_json(ret_data, "发送失败:" + sms_str)
    except Exception:
        add_ajax_error_json(ret_data, "无法获取短信验证码,请重试。")

    return JsonResponse(ret_data)
    def parse_this_month_bill(self, response):
        """
        Parse the current month's bill and request the earlier bills.

        On success ``item["history_bill"]`` is initialised with this month's
        total fee, ``item["real_name"]`` is set from ``accNbrDetail``, and a
        ``queryBill`` request is yielded. On a non-success result code the item
        is yielded and the error is reported through ``error_handle``; any
        exception is reported through ``except_handle``.
        """
        text = response.text
        meta = response.meta
        item = meta["item"]
        username = item["username"]

        try:
            if '"resultCode":"0000"' not in text:
                yield item
                tell_msg = self.get_err_msg(text)
                log_msg = ("电信---获取当前月份话费账单失败:(username:%s, password:%s) %s"
                           % (username, item["password"], text))
                yield from self.error_handle(username, log_msg, tell_msg=tell_msg)
                return

            self.logger.info("[电信-" + username + "]: 获取当前月份话费账单成功!")

            datas = json_loads(text)["responseData"]["data"]
            item["history_bill"] = defaultdict(dict)
            item["history_bill"][strftime('%Y%m')] = {"all_fee": datas["sumCharge"]}
            # user name
            item["real_name"] = datas["accNbrDetail"]

            content = {
                "fieldData": {
                    "accnbr": username,
                    "queryflag": "",
                    "queryType": "0",
                    "billingcycle": get_months_str_by_number(6, False)
                },
                "attach": "test"
            }
            header_infos = {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryBill",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
            }
            form_data = {"content": content, "headerInfos": header_infos}
            yield Request("https://appgo.189.cn:8443/query/bill/queryBill",
                          self.parse_bill_list,
                          "POST",
                          self.appgo_headers,
                          json_dumps(form_data),
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        except Exception:
            yield item
            yield from self.except_handle(username, "电信---解析当前月份话费账单失败: %s" % text)
    def parse_balance(self, response):
        """
        Parse the account balance and request the first personal-info page.

        On success ``item["balance"]`` is set from ``totalBalance`` when the
        response carries data, and a ``queryPersonalInfo`` request is yielded.
        On a non-success result code the error is reported through
        ``error_handle``; any exception goes to ``except_handle``. Unlike the
        bill parsers, the item itself is not yielded on failure.
        """
        text = response.text
        meta = response.meta
        item = meta["item"]
        username = item["username"]

        try:
            if '"resultCode":"0000"' not in text:
                tell_msg = self.get_err_msg(text)
                log_msg = ("电信---获取余额失败:(username:%s, password:%s) %s"
                           % (username, item["password"], text))
                yield from self.error_handle(username, log_msg, tell_msg=tell_msg)
                return

            self.logger.info("[电信-" + username + "]: 获取余额成功!")

            datas = json_loads(text)["responseData"]["data"]
            if datas:
                item["balance"] = datas["totalBalance"]

            content = {
                "fieldData": {
                    "accnbr": username,
                    "phoneType": meta["phoneType"],
                    "starGrade": "11",
                    "shopId": "20002"
                },
                "attach": "test"
            }
            header_infos = {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryPersonalInfo",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
            }
            form_data = {"content": content, "headerInfos": header_infos}
            yield Request("https://appgo.189.cn:8443/query/personal/queryPersonalInfo",
                          self.parse_person_info_one,
                          "POST",
                          self.appgo_headers,
                          json_dumps(form_data),
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        except Exception:
            yield from self.except_handle(username, "电信---解析余额失败: %s" % text)
    def parse_bill_list(self, response):
        """
        Parse the bills of the last six months (current month excluded).

        On success every entry of ``chargeEntities`` is merged into
        ``item["history_bill"]`` keyed by month (an empty charge becomes
        ``"0.0"``) and a ``queryCallRecharge`` request is yielded. On a
        non-success result code the item is yielded and the error reported
        through ``error_handle``; any exception goes to ``except_handle``.
        """
        text = response.text
        meta = response.meta
        item = meta["item"]
        username = item["username"]

        try:
            if '"resultCode":"0000"' not in text:
                yield item
                tell_msg = self.get_err_msg(text)
                log_msg = ("电信---获取最近6个月份话费账单失败:(username:%s, password:%s) %s"
                           % (username, item["password"], text))
                yield from self.error_handle(username, log_msg, tell_msg=tell_msg)
                return

            self.logger.info("[电信-" + username + "]: 获取最近6个月份话费账单成功!")

            datas = json_loads(text)["responseData"]["data"]["chargeEntities"]
            monthly_fees = {}
            for bill in datas:
                monthly_fees[bill["month"]] = {"all_fee": bill["sumCharge"] or "0.0"}
            item["history_bill"].update(monthly_fees)

            content = {
                "fieldData": {
                    "accnbr": username,
                    "billingcycle": get_months_str_by_number(6)
                },
                "attach": "test"
            }
            header_infos = {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryCallRecharge",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
            }
            form_data = {"content": content, "headerInfos": header_infos}
            yield Request("https://appgo.189.cn:8443/query/payMent/queryCallRecharge",
                          self.parse_payment_list,
                          "POST",
                          self.appgo_headers,
                          json_dumps(form_data),
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        except Exception:
            yield item
            yield from self.except_handle(username, "电信---解析最近6个月份话费账单失败: %s" % text)
Beispiel #14
0
 def is_query_condition_exists(self, condition):
     """
     Tell whether a query condition has already been recorded in MongoDB.

     The condition is passed through ``self.dict_sorted`` and serialised to
     JSON (non-ASCII kept as-is); that string is looked up in the
     ``condition`` field.

     :param condition: query condition to look up
     :return: True when a matching document is found; False when none is
              found or when any exception occurs
     """
     try:
         serialized = json_dumps(self.dict_sorted(condition), ensure_ascii=False)
         projection = {"condition": 1, "status": 1, "_id": 0}
         found = self.mongo_instance.getOne(filter={"condition": serialized},
                                            fields=projection)
         return bool(found)
     except Exception:
         return False
Beispiel #15
0
    def start_requests(self):
        """
        Yield the initial list request, routed through the current proxy.

        The request state is reset first. If building or proxying the request
        fails, the failure is delegated to ``exception_handle`` and whatever
        retry request that generator produces is yielded instead.

        :return: generator of ``Request`` objects
        """
        # Reset the request state.
        self.reset_req()

        try:
            request = Request(
                url=self.list_url,
                method='POST',
                callback=self.parse,
                body=json_dumps(self.req_data),
                headers=self.headers,
                dont_filter=True,
                errback=self.err_callback
            )
            self.set_proxy(request)
            yield request
        except Exception:
            # exception_handle is a generator: calling it without iterating
            # runs none of its body, so the retry was silently dropped.
            # Delegate with `yield from` so its retry request is emitted.
            yield from self.exception_handle(self.condition, "start_requests error")
def hanvon_alicloudapi(pic):
    """
    Send an image to the Hanvon text-OCR API and return the raw response text.

    The original author notes a recognition rate of about 70%.

    :param pic: image content placed in the ``image`` field of the JSON body
    :return: response body as text
    """
    host = 'http://text.aliapi.hanvon.com'
    path = '/rt/ws/v1/ocr/text/recg'
    querys = 'code=74e51a88-41ec-413e-b162-bd031fe0407e'
    # NOTE(review): credential is hard-coded in source; consider moving it to configuration.
    appcode = 'd6147d2ef06e4ce09ce029cae877daca'
    request_url = "%s%s?%s" % (host, path, querys)

    payload = {
        'uid': "118.12.0.12",
        "lang": "chns",
        "color": "black",
        'image': pic,
    }
    request_headers = {
        'Authorization': 'APPCODE ' + appcode,
        'Content-Type': 'application/json; charset=UTF-8',
    }

    return http_post(request_url, data=json_dumps(payload), headers=request_headers).text
def hanvon_table_alicloudapi(pic):
    """
    Send an image to the Hanvon table-OCR API and return the raw response text.

    The original author notes a recognition rate of about 70%.

    :param pic: image content placed in the ``image`` field of the JSON body
    :return: response body as text
    """
    host = 'http://table.aliapi.hanvon.com'
    path = '/rt/ws/v1/ocr/table/text/recg'
    querys = 'code=0d3b7d23-915a-4c6f-9886-6312440aba51'
    # NOTE(review): credential is hard-coded in source; consider moving it to configuration.
    appcode = 'd6147d2ef06e4ce09ce029cae877daca'
    request_url = "%s%s?%s" % (host, path, querys)

    payload = {
        'uid': "118.12.0.12",
        "lang": "chns",
        "color": "black",
        'image': pic,
    }
    request_headers = {
        'Authorization': 'APPCODE ' + appcode,
        'Content-Type': 'application/json; charset=UTF-8',
    }

    return http_post(request_url, data=json_dumps(payload), headers=request_headers).text
Beispiel #18
0
def load_and_store_train_data(captcha_store_directory,
                              train_data_path,
                              loop_count=1):
    """
    Interactively extend the captcha training data with freshly fetched images.

    The existing training data is loaded from ``train_data_path``. For each of
    ``loop_count`` rounds a captcha image is downloaded from the JD login page
    and its KNN distance (rounded to two decimals, as a string) is computed.
    When that distance is not yet a key of the training data, the image is
    saved and shown, the operator is asked for its label, and a
    ``(distance, label, file name)`` entry is added. The data is written back
    to ``train_data_path`` at the end.

    :param captcha_store_directory: directory receiving ``captcha_<n>.jpg`` files
    :param train_data_path: path of the JSON training-data file (read and rewritten)
    :param loop_count: number of captcha images to fetch (default 1)
    :return: None
    """
    with open(train_data_path, "r") as train_data_file:
        result = json_loads(train_data_file.read())

    # Next image index follows the number of entries already stored.
    size = len(result) + 1
    for _ in range(loop_count):
        resp = http_get("https://passport.jd.com/new/login.aspx")
        soup = BeautifulSoup(resp.text, "html.parser")
        auth_code_input = soup.select_one("#JD_Verification1")
        image_url = auth_code_input["src2"]
        resp = http_get("https:%s" % image_url)
        with Image.open(BytesIO(resp.content)) as image:
            knn_distance = get_knn_distance(image)
            knn_distance = str(float('%.2f' % knn_distance))

            if knn_distance not in result:
                image.save("%s/captcha_%d.jpg" %
                           (captcha_store_directory, size))
                image.show()
                # Ask the operator to type the captcha shown.
                label = input("请输入验证码:")
                label = label.strip()
                result[knn_distance] = (knn_distance, label,
                                        "captcha_%d.jpg" % size)
                size += 1

    with open(train_data_path, "w") as train_data_file:
        train_data_file.write(json_dumps(result))


# This call used to sit inside the function body with its guard commented out,
# which made the function call itself unconditionally on every invocation.
# Raw strings keep the same Windows paths without invalid escape sequences.
if __name__ == "__main__":
    load_and_store_train_data(
        r"F:\work\公司文档\爬虫\京东验证码",
        r"F:\software\pycharm\workspace\crawler\crawler_bqjr\crawler_bqjr\spiders\b2c_ecommerce_spiders\train_data.json",
        20)
    def login(self, response):
        """
        Log in through the telecom app API.

        Marks the item's brand and yields a ``loginNormal`` POST request built
        from the item's username and password; the response is handled by
        ``parse_login``.
        """
        meta = response.meta
        item = meta["item"]
        item["brand"] = "电信"
        username, password = item["username"], item["password"]

        content = {
            "fieldData": {
                "accountType": "c2000004",
                "phoneNum": username,
                "isChinatelecom": "0",
                "systemVersion": "4.4.4",
                "authentication": password,
                "deviceUid": "860096537016542",
                "loginType": "4"
            },
            "attach": "test"
        }
        header_infos = {
            "timestamp": strftime("%Y%m%d%H%M%S"),
            "code": "loginNormal",
            "source": "110003",
            "token": "null",
            "userLoginName": username,
            "sourcePassword": "******",
            "clientType": "#6.2.1#channel29#Huawei DUK-AL20#"
        }
        form_data = {"content": content, "headerInfos": header_infos}
        yield Request("https://appgo.189.cn:8443/login/normal",
                      self.parse_login,
                      "POST",
                      self.appgo_headers,
                      json_dumps(form_data),
                      meta=meta,
                      dont_filter=True,
                      errback=self.err_callback)
Beispiel #20
0
    def _login(self, response):
        """
        Build and submit the JD login form.

        Reads the hidden form values from the login page, RSA-encrypts the
        password with the page's public key, collects the cookies set by the
        page, decides whether an image captcha is needed (asking the user for
        it when so), and yields the login ``FormRequest`` handled by
        ``_parse_login_status``.

        :param response: response of the login page; its ``meta["item"]``
                         carries the username and password
        :return: generator yielding the login request or error-handler output
        """
        meta = response.meta
        item = meta["item"]
        username = item["username"]

        try:
            base_url = response.urljoin("/uc/loginService")
            uuid = self.get_value_by_name(response, "uuid")
            pubKey = self.get_value_by_name(response, "pubKey")
            seqSid = ''

            login_post_url = "%s?uuid=%s&ReturnUrl=%s&r=%s&version=2015" \
                             % (base_url, uuid, "https%3A%2F%2Fwww.jd.com%2F", str(random()))

            # The password is RSA-encrypted with the page's public key.
            rsa = RsaUtil(key_is_hex=False)
            encode_pwd = rsa.encrypt(item["password"], pubkey=pubKey)

            # Flatten every "k=v" pair of all Set-Cookie headers into one dict.
            temp_cookies = {}
            for c in response.headers.getlist('Set-Cookie', []):
                temp_cookies.update(
                    dict(kv.strip().split("=", 1)
                         for kv in c.decode().split(";") if "=" in kv))
            meta["cookies"] = temp_cookies
            authcode = ""
            # A visible captcha block on the page means a captcha is required;
            # otherwise ask the showAuthCode endpoint.
            capthca_div = response.xpath(
                '//div[@id="o-authcode"][@style="display: block;"]'
            ).extract_first()
            if capthca_div:
                need_captcha = True
            else:
                auth_url = 'https://passport.jd.com/uc/showAuthCode?r=%s&version=2015' % str(
                    random())
                auth_page = self.http_request(auth_url,
                                              method="POST",
                                              cookies=temp_cookies,
                                              headers=self.headers,
                                              data={"loginName": username})
                need_captcha = ('verifycode":true' in auth_page)
            code_url = response.xpath(
                '//img[@id="JD_Verification1"]/@src2').extract_first()
            if need_captcha and code_url:
                self.logger.info("需要输入验证码")
                new_headers = self.authcode_headers
                # The captcha URL may be protocol-relative; prefix the scheme.
                code_url = "https:" + code_url if not code_url.startswith(
                    "http") else code_url
                code_body = self.http_request(code_url,
                                              headers=new_headers,
                                              get_str=False)
                # Store the headers and uuid in SSDB so the image captcha can
                # be refreshed later.
                ssdb_headers_data = json_dumps({
                    "headers": new_headers,
                    "uuid": uuid
                })
                self.set_image_captcha_headers_to_ssdb(
                    headers=ssdb_headers_data, username=username)
                authcode = self.ask_image_captcha(code_body, username)
                self.logger.info("验证码:%s" % authcode)

            login_post_data = {
                "uuid": uuid,
                "eid": self.get_value_by_name(response, "eid"),
                "fp": self.get_value_by_name(response, "fp"),
                "_t": self.get_value_by_name(response, "_t"),
                "loginType": self.get_value_by_name(response, "loginType"),
                "loginname": username,
                "nloginpwd": encode_pwd,
                "chkRememberMe": "",
                "authcode": authcode,
                "pubKey": pubKey,
                "sa_token": self.get_value_by_name(response, "sa_token"),
                # seqSid is always the empty string here.
                "seqSid": seqSid or ''
            }
            # NOTE(review): this logs the whole form, including the encrypted password.
            self.logger.debug(login_post_data)

            yield FormRequest(url=login_post_url,
                              headers=self.headers,
                              cookies=meta["cookies"],
                              callback=self._parse_login_status,
                              meta=meta,
                              formdata=login_post_data,
                              dont_filter=True,
                              errback=self.err_callback)
        except BadCaptchaFormat:
            yield from self.error_handle(username, "获取验证码图片失败")
        except Exception:
            yield from self.except_handle(username, "准备登录参数异常")
Beispiel #21
0
def store_train_data(data, filepath):
    """
    Write the training data to ``filepath`` as JSON, replacing the file.

    :param data: JSON-serialisable training data
    :param filepath: destination file path
    :return: None
    """
    with open(filepath, mode="w") as out_file:
        serialized = json_dumps(data)
        out_file.write(serialized)
    def parse_person_info_two(self, response):
        """
        Parse the second personal-information response and start the call-detail query.

        On a successful result code the item receives ``is_real_name``,
        ``sex``, ``identification_number`` and ``contact_addr``; the login
        token is stored in SSDB, the user is asked for an SMS code, and the
        request returned by ``get_call_detail`` for today's date is yielded.
        On a non-success result code the item is yielded and the error is
        reported; a captcha timeout and any other exception are reported
        separately after yielding the item.
        """
        meta = response.meta
        item = meta["item"]
        login_token = meta["login_token"]
        username = item["username"]

        # Kept empty until the response is decoded so the except branch can log it.
        text = ""
        try:
            res_dict = self.dx_conver.convert_response_data(response.text)
            text = json_dumps(res_dict, ensure_ascii=False)

            if '"ResultCode":{"value":"0000"}' in text:
                self.logger.info("[电信-" + username + "]: 获取个人信息2成功!")

                datas = res_dict["Response"]["ResponseData"]["Data"]
                basic_info = datas["BasicInfo"]
                item["is_real_name"] = (datas["Authenticate"]["value"] == "true")
                item["sex"] = Sex.Male if basic_info["Sex"]["value"] == "0" else Sex.Female
                item["identification_number"] = basic_info["IdCardNo"]["value"]
                item["contact_addr"] = basic_info["Address"]["value"]

                # Personal info 3 is not used for now.
                # form_data = {
                #     "Request": {
                #         "HeaderInfos": {
                #             "ClientType": "#6.2.1#channel8#samsung SM-N935F#",
                #             "Source": "110003",
                #             "SourcePassword": "******",
                #             "Token": login_token,
                #             "UserLoginName": username,
                #             "Code": "custInfo",
                #             "Timestamp": strftime("%Y%m%d%H%M%S")
                #         },
                #         "Content": {
                #             "Attach": "test",
                #             "FieldData": {
                #                 "PhoneNbr": username
                #             }
                #         }
                #     }
                # }
                # from_str = self.dx_conver.convert_request_data(form_data)
                # yield Request("http://cservice.client.189.cn:8004/map/clientXML?encrypted=true",
                #               self.parse_person_info_three, "POST", self.cservice_headers, from_str,
                #               meta=meta, dont_filter=True, errback=self.err_callback)

                # Querying call records requires the SMS code, the ID-card number and the name.
                # self._set_sms_captcha_headers_to_ssdb(username, login_token)
                # uid = self.need_name_idcard_sms_captcha_type(username)
                # sms_str = self.ask_captcha_code(uid)
                # sms_arr = sms_str.split("_")
                # sms_code, name, id_card,  = sms_arr[0], sms_arr[1], sms_arr[2]
                # item["is_real_name"] = True
                # item["real_name"] = name
                # item["identification_number"] = id_card

                # Telecom app bug: the ID-card and name checks can be bypassed,
                # so only the SMS code is needed.
                self._set_sms_captcha_headers_to_ssdb(username, login_token)
                uid = self.need_sms_captcha_type(username)
                sms_code = self.ask_captcha_code(uid)

                # Reset the call-history state before the first call-detail request.
                item["history_call"] = {}
                meta["call_count"] = 0
                yield self.get_call_detail(response, sms_code, date.today())
            else:
                yield item
                tell_msg = self.get_err_msg(text, True)
                yield from self.error_handle(username,
                                             "电信---获取个人信息2失败:(username:%s, password:%s) %s"
                                             % (username, item["password"], text),
                                             tell_msg=tell_msg)
        except CaptchaTimeout:
            yield item
            yield from self.error_handle(username, "电信---解析验证码失败,等待验证码超时",
                                         tell_msg="等待验证码超时,请刷新页面重试。")
        except Exception:
            yield item
            yield from self.except_handle(username, "电信---解析个人信息2失败: %s" % text)
    def parse_login(self, response):
        """
        Handle the response of the login request.

        Result code "0000" counts as success: the item's city is filled in
        when it is empty, the login token and phone type are stored in
        ``response.meta`` (values already present are kept) and the balance
        query is scheduled. Every other response is reported through
        ``self.error_handle`` together with a message for the user.
        """
        body = response.text
        meta = response.meta
        item = meta["item"]
        username = item["username"]

        # Known failure markers, in the order they are checked, each paired
        # with the message shown to the user.
        known_failures = (
            ('"resultCode":"3001"', "账号或密码错误!"),
            ('"resultCode":"3002"', "该手机号还未进行注册!"),
            ('"resultCode":"8105"', "弱密码,请重置服务密码后登录!"),
            ('"code":"X102"', "设备机型不能为空!"),
        )
        failure_log = "电信---登录失败:(username:%s, password:%s) %s"

        try:
            if '"resultCode":"0000"' not in body:
                detail = body
                tell_msg = next(
                    (msg for marker, msg in known_failures if marker in body),
                    None)
                if tell_msg is None:
                    if body == "":
                        # An empty body is logged with a fixed description.
                        detail = "电信服务器异常"
                        tell_msg = "电信服务器异常,请稍后再试!"
                    else:
                        # Unknown failure: let get_err_msg derive the message.
                        tell_msg = self.get_err_msg(body)
                yield from self.error_handle(
                    username,
                    failure_log % (username, item["password"], detail),
                    tell_msg=tell_msg)
                return

            self.logger.info("[电信-" + username + "]: 登录成功!")
            login_result = json_loads(body)["responseData"]["data"]["loginSuccessResult"]

            if item["city"] == "":
                item["city"] = login_result["cityName"]
            # setdefault keeps a token / phone type that is already in meta.
            meta.setdefault("login_token", login_result["token"])
            meta.setdefault("phoneType", login_result["phoneType"])

            field_data = {
                "queryflag": "",
                "payflag": "0",
                "destinationid": username,
                "shopId": "20002"
            }
            header_infos = {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryExpense",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#samsung SM-N935F#"
            }
            payload = {
                "content": {
                    "fieldData": field_data,
                    "attach": "test"
                },
                "headerInfos": header_infos
            }
            yield Request("https://appgo.189.cn:8443/query/balance/queryExpense",
                          callback=self.parse_balance,
                          method="POST",
                          headers=self.appgo_headers,
                          body=json_dumps(payload),
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        except Exception:
            yield from self.except_handle(username, "电信---解析登录失败: %s" % body)
def telecom_bills_validation(request):
    """
    Validate a telecom SMS captcha together with name and ID card number.

    Reads ``username``, ``sms_captcha``, ``name``, ``id_card`` and
    ``account_type`` from ``request.POST``, loads the headers stored in SSDB
    under ``username + ACCOUNT_CRAWLING_SMS_HEADERS_SSDB_SUFFIX +
    account_type``, and sends a ``randomCodeAndAuthValidate`` request that
    carries the stored token. The decoded ``ResultCode`` decides the answer.

    :param request: request object exposing the submitted form as ``POST``.
    :return: ``JsonResponse`` built from the ajax ok/error payload. Every
        failure, including a missing form field, is reported in that payload
        rather than raised.
    """
    ret_data = {}
    args = request.POST
    dx_conver = DXConvertData()

    url = "http://cservice.client.189.cn:8004/map/clientXML?encrypted=true"
    try:
        # Read the required fields inside the try block: a missing key then
        # produces the same JSON error as any other failure instead of
        # escaping from the view as an unhandled exception.
        username = args["username"].strip()
        sms_captcha = args["sms_captcha"]
        name = args["name"]
        id_card = args["id_card"]
        key = username + ACCOUNT_CRAWLING_SMS_HEADERS_SSDB_SUFFIX + args["account_type"]

        ssdb_conn = get_ssdb_conn()
        headers = ssdb_conn.get(key)
        if headers is not None:
            token = json_loads(headers)["token"]
            form_data = {
                "Request": {
                    "HeaderInfos": {
                        "ClientType": "#6.2.1#channel8#Huawei DUK-AL20#",
                        "Source": "110003",
                        "SourcePassword": "******",
                        "Token": token,
                        "UserLoginName": username,
                        "Code": "randomCodeAndAuthValidate",
                        "Timestamp": strftime("%Y%m%d%H%M%S"),
                    },
                    "Content": {
                        "Attach": "test",
                        "FieldData": {
                            "ShopId": "20002",
                            "IdCardNum": id_card,
                            "RandomCode": sms_captcha,
                            "PhoneNum": username,
                            "Username": name,
                            "ValidateType": "1"
                        }
                    }
                }
            }
            form_str = dx_conver.convert_request_data(form_data)
            res_content = http_post(url, headers=CSERVICE_HEADERS, data=form_str, verify=False)

            # Decode the response, then serialise it again so the result
            # code can be matched as a substring.
            res_dict = dx_conver.convert_response_data(res_content.text)
            res_str = json_dumps(res_dict, ensure_ascii=False)

            if '"ResultCode": {"value": "0000"}' in res_str:
                add_ajax_ok_json(ret_data)
            elif '"ResultCode": {"value": "0001"}' in res_str:
                add_ajax_error_json(ret_data, "非实名制用户")
            elif '"ResultCode": {"value": "9152"}' in res_str:
                add_ajax_error_json(ret_data, "验证码错误!")
            else:
                add_ajax_error_json(ret_data, "验证失败,请重试:" + res_str)
        else:
            # Nothing stored under the key for this user.
            add_ajax_error_json(ret_data, "验证失败,没有用户信息,请刷新重试。")
    except Exception:
        add_ajax_error_json(ret_data, "验证失败,请重试。")

    return JsonResponse(ret_data)
Beispiel #25
0
    def parse(self, response):
        """
        Parse one page of the document list and schedule follow-up requests.

        The response body is base64-decoded, decrypted and read as a JSON
        list. A detail request is yielded for every record whose document id
        is non-empty and not stored yet. Afterwards the paging offset is
        advanced, the current query condition is recorded with status 1 and
        the next list request is yielded. Any exception is passed to
        ``self.exception_handle``.
        """
        self.exception_response(self.condition, response)
        self.logger.info("list_req_data->%s" % self.req_data)
        self.logger.info("skip->%s" % self.start_index)
        self.logger.info("parse_response->%s" % response.text)

        # Item field -> key of the decoded list record.
        field_map = (
            ("case_type", "案件类型"),
            ("sentence_date", "裁判日期"),
            ("case_name", "案件名称"),
            ("file_id", "文书ID"),
            ("trial_procedure", "审判程序"),
            ("case_no", "案号"),
            ("court_name", "法院名称"),
            ("relation", "关联文书"),
        )

        try:
            # base64-decode first, then decrypt and turn the bytes into text.
            decoded = decrypt(b64decode(response.text)).decode("utf-8")
            # Drop everything after the last "]".
            decoded = decoded[:decoded.rfind(']') + 1]
            docs = json_loads(decoded)
            self.logger.info("list->%s:%s" % (str(len(docs)), decoded))

            for doc in docs:
                item = WenshuItem()
                for field, source_key in field_map:
                    item[field] = doc.get(source_key, "")
                file_id = item["file_id"]
                # Skip records without an id and records already stored.
                if not file_id or self.is_wenshu_id_exists(file_id):
                    self.logger.info("%s has saved!continue!" % file_id)
                    continue
                req_data = {
                    "fileId": file_id,  # document id
                    "reqtoken": get_reqtoken()  # request token
                }
                self.logger.info("doc_req_data->%s" % req_data)
                doc_request = Request(
                    url=self.doc_url,
                    callback=self.parse_doc,
                    method='POST',
                    headers=self.headers,
                    body=json_dumps(req_data),
                    meta={"item": item},
                    dont_filter=True,
                    errback=self.err_callback
                )
                self.set_proxy(doc_request)
                yield doc_request

            # Move the offset forward by one page for the next list request.
            self.start_index += self.page_size
            # Record the current query condition with status 1.
            self.record_query_condition(self.dict_sorted(self.condition), 1)

            no_docs = not docs
            # The first page should contain data; an empty one may be a proxy
            # problem, so the condition is pushed back and the proxy replaced.
            if no_docs and self.start_index <= 20:
                self.logger.debug("查询结果至少第一页应该有数据,否则就可能是代理的问题")
                self.push_wenshu_condition(self.condition)
                self.proxy = self.proxy_api.get_proxy_one()  # switch proxy
                self.reset_req()

            # No data, or offset beyond 220: reset the request state.
            # NOTE(review): when the first page is empty, reset_req() has
            # already run in the block above and runs again here - confirm
            # that the double call is intended.
            if no_docs or self.start_index > 220:
                self.reset_req()
            self.req_data["skip"] = str(self.start_index)
            list_request = Request(
                url=self.list_url,
                callback=self.parse,
                method='POST',
                headers=self.headers,
                body=json_dumps(self.req_data),
                dont_filter=True,
                errback=self.err_callback
            )
            self.set_proxy(list_request)
            yield list_request
        except Exception:
            self.exception_handle(self.condition, "parse error")
    def generate_query(self):
        """
        Generate query conditions per court and push them onto the query queue.

        For every court name read from MongoDB, the range ``self.start_date``
        to ``self.end_date`` is cut into date windows. A window is accepted
        when ``get_count_by_condition`` reports between 0 and 220 hits:
        windows with hits are recorded and queued, empty windows are only
        recorded with status -1. A window with any other count is retried
        with a shorter length; a one-day window that still overflows is
        stored once per case type. When a court needed more than one pass,
        the whole range is additionally recorded with status -1 and case
        type "0", which lets the existence check at the top of the loop skip
        the court on a later run.

        :return: None
        """
        self.logger.info("query condition init begin!")

        # Courts to query (part of the query condition).
        mongo_instance = MongoDB(MONGO_WENSHU_DB, MONGO_CHINACOURT_COLLECTIONS)
        # Ask for a cursor that does not time out.
        cursor = mongo_instance.getAll(fields={
            "_id": 1,
            "name": 1
        },
                                       sort=[("province", MONGO_ASCENDING)],
                                       no_cursor_timeout=True)
        court_list = [court["name"] for court in cursor]
        # Case types used when a single day still exceeds the limit.
        case_type_list = ["1", "2", "3", "4", "5"]
        for court in court_list:
            count = 1
            # Window length in days estimated from the observed hit density;
            # used instead of the divisor-based length while it is positive.
            avg_interval = 0
            # Previous estimate, restored after the new one was used once.
            avg_interval_first = 0
            start_date = datetime.strptime(self.start_date, "%Y-%m-%d")
            end_date = datetime.strptime(self.end_date, "%Y-%m-%d")
            while True:
                # Divisor sequence by pass: 1, 2, 9, 16, ...
                divisor = (count**2) if count != 2 else 2
                # Window length in days for this pass.
                interval_day = avg_interval if avg_interval > 0 else ceil(
                    (end_date - start_date).days / divisor)
                if avg_interval_first > 0:
                    avg_interval = avg_interval_first
                    avg_interval_first = 0
                self.logger.info("interval_day->%s" % interval_day)

                # Build the date window, capped at the overall end date.
                end_date_temp = min(start_date + timedelta(days=interval_day),
                                    end_date)
                query_date = "%s TO %s" % (start_date.strftime("%Y-%m-%d"),
                                           end_date_temp.strftime("%Y-%m-%d"))
                self.logger.info("query_date->%s!" % query_date)
                query_condition = dict()
                query_condition["case_type"] = "0"  # all case types
                query_condition["court"] = court
                query_condition["date"] = query_date
                if self.is_query_condition_exists(query_condition):
                    if end_date == end_date_temp:
                        self.logger.info("%s query_condition exists!break!" %
                                         court)
                        break
                    else:
                        # Already known: continue right after this window.
                        start_date = end_date_temp + self.one_day
                        self.logger.info(
                            "%s query_condition exists!continue!" %
                            json_dumps(query_condition))
                        continue
                query_count = self.get_count_by_condition(court=court,
                                                          date=query_date)
                if 0 <= query_count <= 220:
                    # Small enough: record it and, when it has hits, queue it.
                    if query_count > 0:
                        self.record_query_condition(query_condition)
                        self.push_query_condition_queue(query_condition)
                    # No hits: only record the condition with status -1.
                    if query_count == 0:
                        self.record_query_condition(query_condition, -1)
                    if end_date == end_date_temp:
                        if count > 1:
                            # Also record the whole range for this court
                            # with status -1.
                            init_date = "%s TO %s" % (self.start_date,
                                                      self.end_date)
                            query_condition["date"] = init_date
                            self.record_query_condition(query_condition, -1)
                        self.logger.info("%s query condition end!" % court)
                        break
                    else:
                        start_date = end_date_temp + self.one_day
                else:
                    if count > 1:
                        avg_interval_first = avg_interval
                    temp_days = (end_date_temp - start_date).days
                    try:
                        # Window length expected to hold about 180 hits at
                        # the density just observed.
                        avg_interval = int(180 /
                                           (int(query_count) / temp_days))
                    except ZeroDivisionError:
                        # temp_days is 0 when the window starts and ends on
                        # the same day; give up on this court.
                        self.logger.exception("爬取出错,出错原因:")
                        break
                    # A one-day window that still overflows is stored once
                    # per case type instead.
                    if temp_days == 1:
                        for case_type in case_type_list:
                            query_condition["case_type"] = case_type
                            if not self.is_query_condition_exists(
                                    query_condition):
                                self.record_query_condition(query_condition)
                                self.push_query_condition_queue(
                                    query_condition)
                        if end_date == end_date_temp:
                            if count > 1:
                                # Also record the whole range for this court
                                # with status -1. The loop above leaves
                                # case_type at the last specific type, so
                                # restore "0": otherwise this marker differs
                                # from the one written in the branch above
                                # and never matches the existence check at
                                # the top of the loop.
                                init_date = "%s TO %s" % (self.start_date,
                                                          self.end_date)
                                query_condition["case_type"] = "0"
                                query_condition["date"] = init_date
                                self.record_query_condition(
                                    query_condition, -1)
                            self.logger.info("%s query condition end!" % court)
                            break
                        else:
                            start_date = end_date_temp + self.one_day
                count += 1
        self.logger.info("query condition init end!")
    def parse_call_list(self, response):
        """
        Parse the call records of one month and schedule the next step.

        On result code "0000" the records are stored in
        ``item["history_call"]`` under the month key (``%Y%m``). Code "9152"
        asks for a new SMS captcha and repeats the same month, code "8204"
        ends the crawl with an error, and other failures are retried while
        the month's retry counter is below ``self.retry_call_times``. After
        a handled month either the previous month is requested or, once
        ``meta["call_count"]`` reaches ``self.CALL_COUNT_LIMIT``, the
        current month's bill.
        """
        meta = response.meta
        item = meta["item"]
        sms_code = meta["sms_code"]
        username = item["username"]

        res_text = ""
        try:
            this_month = meta["last_call_month"]
            month_key = this_month.strftime("%Y%m")
            res_dict = self.dx_conver.convert_response_data(response.text)
            res_text = json_dumps(res_dict, ensure_ascii=False)

            if '"ResultCode":{"value":"0000"}' in res_text:
                self.crawling_login(username)  # report successful authorisation

                self.logger.info("[电信-" + username + "]: 获取-" + month_key + "-通话记录成功!")

                item["history_call"][month_key] = []
                payload = res_dict["Response"]["ResponseData"]["Data"]
                if payload:
                    entries = payload["Items"]["Item"]
                    # A single record arrives as a bare mapping; wrap it so
                    # one conversion handles both shapes.
                    if not isinstance(entries, list):
                        entries = [entries]
                    item["history_call"][month_key] = [{
                        "time": entry["CallTime"]["value"],
                        "duration": entry["CallTimeCost"]["value"],
                        "type": "1" if entry["CallType"]["value"] == "0" else "0",
                        "other_num": entry["CallMobile"]["value"],
                        "my_location": entry["CallArea"]["value"],
                        "fee": entry["CallFee"]["value"]
                    } for entry in entries]
            elif '"ResultCode":{"value":"9152"}' in res_text:
                # SMS captcha expired: request a new one and repeat the month.
                self.logger.info("[电信]短信验证码超时,请求重新发送短信验证码!")
                self._set_sms_captcha_headers_to_ssdb(username, meta["login_token"])
                sms_uid = self.need_sms_captcha_type(username)
                sms_code = self.ask_captcha_code(sms_uid)
                meta["call_count"] -= 1
                yield self.get_call_detail(response, sms_code, this_month)
                return
            elif '"ResultCode":{"value":"8204"}' in res_text:
                yield item
                yield from self.error_handle(username,
                                             "电信---获取通话记录失败:(username:%s, password:%s) %s"
                                             % (username, item["password"], res_text),
                                             tell_msg="此账号不支持通话记录查询,请联系电信运营商。")
                return
            elif meta.get("call_" + month_key + "_retry", 0) < self.retry_call_times:
                # Pick the log line first, then retry the same month.
                if '"ResultCode":{"value":"1"}' in res_text:
                    reason = "电信---获取通话记录失败:服务器异常!"
                elif response.text == "":
                    reason = "电信---获取通话记录失败:服务器返回值为空!"
                else:
                    reason = "电信---获取通话记录失败:{}".format(res_text)
                self.logger.error(reason)
                yield self.retry_call_list(response, res_text)
                return
            else:
                # Retries used up: log it and move on.
                self.logger.error("电信---获取通话记录失败:(username:%s, password:%s) %s"
                                  % (username, item["password"], res_text))

            # Request the previous month while the limit is not reached.
            if meta["call_count"] < self.CALL_COUNT_LIMIT:
                yield self.get_call_detail(response, sms_code,
                                           get_last_month_from_date(this_month))
                return

            # Otherwise request the bill of the current month.
            field_data = {
                "accnbr": username,
                "queryflag": "0",
                "queryType": "0",
            }
            header_infos = {
                "timestamp": strftime("%Y%m%d%H%M%S"),
                "code": "queryThisMonthBill",
                "source": "110003",
                "token": meta["login_token"],
                "userLoginName": username,
                "sourcePassword": "******",
                "clientType": "#6.2.1#channel8#Huawei DUK-AL20#",
            }
            bill_form = {
                "content": {
                    "fieldData": field_data,
                    "attach": "test"
                },
                "headerInfos": header_infos
            }
            yield Request("https://appgo.189.cn:8443/query/bill/queryThisMonthBill",
                          callback=self.parse_this_month_bill,
                          method="POST",
                          headers=self.appgo_headers,
                          body=json_dumps(bill_form),
                          meta=meta,
                          dont_filter=True,
                          errback=self.err_callback)
        except CaptchaTimeout:
            yield item
            yield from self.error_handle(username, "电信---解析验证码失败,等待验证码超时",
                                         tell_msg="等待验证码超时,请刷新页面重试。")
        except Exception:
            yield item
            yield from self.except_handle(username, "电信---解析通话记录失败: %s" % res_text)