Esempio n. 1
0
    def parse_ajax(self, response):
        """Parse the JSON detail payload, or retry through a captcha check.

        :param response: ajax response; may be a ``fs/check`` captcha
            redirect instead of the expected JSON body.
        :return: yields a retry ``scrapy.Request`` (captcha case) or a
            populated ``BaiduHonorItem``.
        """
        self.header["Referer"] = response.url

        if "fs/check" in response.url:
            # Captcha interstitial: pull the original target URL out of the
            # redirect and retry it, at most twice (cnt2 guards loops).
            url = re.search(self.pp_url, response.url).groups()[0]
            meta = response.meta
            cnt2 = meta.get("cnt2", 0)
            if cnt2 < 2:
                # The embedded URL is double-escaped.
                url = unquote(unquote(url))
                cnt2 += 1
                logger.info("ajax验证码:{}".format(url))
                yield scrapy.Request(url,
                                     headers=self.header,
                                     callback=self.parse_ajax,
                                     dont_filter=True,
                                     meta={
                                         "cnt2": cnt2,
                                         "old_key": meta["old_key"]
                                     },
                                     priority=420)

        else:
            json_data = json.loads(response.text)
            data = json_data["data"]
            if json_data["status"] != 0 or not data:
                logger.info("[ajax返回空数据]")
                # Fix: was ``return {}`` — inside a generator that value only
                # travels via StopIteration and is discarded by scrapy, so a
                # bare return says what actually happens.
                return
            item = BaiduHonorItem()
            item["company_name"] = data["entName"]
            item["source_url"] = response.url
            # One field per line: the original paired tuple-unpacks wrapped
            # awkwardly and obscured which key fed which field.
            item["credit_code"] = data.get("unifiedCode", data.get("regNo", ""))
            item["legal_person"] = data["legalPerson"]
            item["register_status"] = data["openStatus"]
            item["establish_date"] = data["startDate"]
            item["business_date"] = data["openTime"]
            item["register_capital"] = self.handle_number(data["regCapital"])
            item["address"] = data["regAddr"]
            item["register_office"] = data["authority"]
            item["company_type"] = data["entType"]
            item["business_scope"] = data["scope"]
            item["business_license"] = data.get("licenseNumber", "")
            try:
                item["other"] = self.handle_other(
                    data.get("shares", []), data.get("directors", []),
                    data.get("licenseNumber", ""))
            except Exception as e:
                # Fix: was a bare print(); route through the crawl logger so
                # the failure is visible in the scrapy log.
                logger.info("解析other出现异常:{}".format(str(e)))
                item["other"] = ""
            item["site_name"] = "百度信用"
            item["old_key"] = response.meta["old_key"]
            yield item
Esempio n. 2
0
def modify_redis(com_name, state, old_key):
    """Move a company's redis fingerprint to its post-write state.

    On a successful MySQL write (``state`` truthy) the value is copied to
    ``2_<name>`` on the second redis client and the old key is deleted;
    on failure the key is renamed to ``3_<name>`` in place.
    """
    # NOTE(review): with a default redis client ``get`` returns bytes, so
    # str() would produce "b'...'"; presumably decode_responses=True on
    # rd_cli — confirm against the client setup.
    value = str(rd_cli.get(old_key))
    if state:
        new_rd_cli.set("2_{}".format(com_name), value)
        rd_cli.delete(old_key)
        logger.info("old_key={} 删除成功".format(old_key))
    else:
        rd_cli.rename(old_key, "3_{}".format(com_name))
Esempio n. 3
0
    def process_item(self, item, spider):
        """Format the item, write it to MySQL, then update redis bookkeeping.

        :param item: scraped item carrying an ``old_key`` redis handle.
        :param spider: owning spider (unused here).
        :return: the item, minus its ``old_key`` field.
        """
        old_key = item.pop("old_key")
        self.value_fmt(item)
        ok = self.write_mysql(item)
        if ok:
            logger.info("write tname={} successful!".format("company"))
        else:
            logger.info("write tname={}, company_name={} fail!".format(
                "company", item["company_name"]))

        modify_redis(item["company_name"], ok, old_key)
        return item
Esempio n. 4
0
    def write_mysql(self, data):
        """Insert one company row into MySQL.

        On failure the transaction is rolled back and the row is parked in
        the mongo ``err`` collection for later replay.

        :param data: mapping of column name -> value for the ``company`` table.
        :return: 1 on successful commit, 0 on failure (callers treat the
            result as a boolean ``state``).
        """
        insert_sql = self.gen_sql(tname="company",
                                  optype="I",
                                  **{"insert_info": data})
        try:
            mysql_cursor.execute(insert_sql)
            mysql_conn.commit()
        # Fix: was a bare ``except:`` which also swallows SystemExit /
        # KeyboardInterrupt and hid the actual MySQL error; narrow the
        # catch and log what went wrong.
        except Exception as mysql_err:
            logger.info("mysql写入失败: {}".format(str(mysql_err)))
            try:
                mogo_db["err"].insert_one({"data": data})
            except Exception as e:
                if "duplicate" in str(e):
                    logger.info("mongo的公司={}已经存在".format(data["company_name"]))
            mysql_conn.rollback()
            return 0

        else:
            return 1
Esempio n. 5
0
def test_execjs():
    """Smoke-test the token-mixing JS routine through execjs."""
    source = """
            function mix(tk, bid) {
                var tkLen = tk.length;
                tk = tk.split('');
                var bdLen = bid.length;
                bid = bid.split('');
                for (var i = 0; i < bdLen; i++){
                    bid[i] = parseInt(bid[i]) + parseInt(tkLen - bdLen);
                }
                for(var i = bdLen - 1; i >= 0; i -= 2) {
                    var tmp = tk[bid[i]];
                    tk[bid[i]] = tk[bid[i - 1]];
                    tk[bid[i - 1]] = tmp;
                }
                return tk.join("");
            };
        """
    runtime = execjs.compile(source)
    mixed = runtime.call("mix", "xlTM-TogKuTwUdEvSf02*PeznbDZ0iQeKAmd",
                         "231392331688")
    logger.info(mixed)
Esempio n. 6
0
    def parse_list(self, response):
        """Parse a result-list page and schedule its detail-page requests.

        :param response: list-page response; may be a ``fs/check``
            captcha redirect instead of the real listing.
        :return: yields retry or detail ``scrapy.Request`` objects.
        """
        old_key = response.meta["old_key"]
        info_text = "".join(
            response.xpath("//div[@class='info']//text()").extract()).replace(
                "\n", "")
        # Bail out early on an error page and drop the redis key.
        if "抱歉" in info_text:
            logger.info("返回异常,{}".format(info_text))
            rd_cli.delete(old_key)
            return
        self.header["Referer"] = response.url

        if "fs/check" not in response.url:
            # Normal listing: fan out one detail request per extracted link.
            links = self.handle_link(
                response.xpath(
                    "//div[@class='zx-list-item']//a[contains(@class, 'list-item-url')]/@href"
                ).extract())
            for link in links:
                logger.info("href={}".format(link))
                yield scrapy.Request(url=link,
                                     headers=self.header,
                                     callback=self.parse_detail,
                                     priority=350,
                                     meta={"old_key": old_key})
            return

        # Captcha interstitial: re-request the embedded (double-escaped)
        # target URL, at most twice.
        target = re.search(self.pp_url, response.url).groups()[0]
        retries = response.meta.get("cnt", 0)
        if retries < 2:
            target = unquote(unquote(target))
            logger.info("列表验证码:{}".format(target))
            time.sleep(1.5)
            yield scrapy.Request(target,
                                 headers=self.header,
                                 callback=self.parse_list,
                                 dont_filter=True,
                                 meta={
                                     "cnt": retries + 1,
                                     "old_key": old_key
                                 },
                                 priority=300)
Esempio n. 7
0
    def exe_js(self, html, js_str):
        """Run the page's token-mixing JS and return the ``tot`` parameter.

        :param js_str: raw JS snippet scraped from the page; names the
            element id / attribute that hold the ``tk`` token.
        :param html: parsed response object supporting ``xpath``.
        :return: the mixed ``tot`` token string.
        """
        mix_src = """function mix(tk, bid) {tk = tk.split('');var bdLen = bid.length;bid = bid.split('');var dk = parseInt(bid[3]);var one = tk[(parseInt(bid[bdLen - 1]) + dk) % 10];for (var i = bdLen - 1; i >= 0; i -= 1) {tk[(parseInt(bid[i]) + dk) % 10] = tk[(parseInt(bid[i - 1]) + dk) % 10];if ((i - 2) < 0) {tk[(parseInt(bid[i - 1]) + dk) % 10] = one;break;}}return tk.join('');}"""
        runtime = execjs.compile(mix_src)

        id_attr_pat = re.compile(
            r".*?getElementById\('(\w+)'\)\.getAttribute\('(\w+)'\)",
            re.DOTALL | re.UNICODE)
        # The captured names come back in upper case.
        elem_id, attr_name = re.search(id_attr_pat, js_str).groups()

        # xpath rule for the tk parameter; the attribute must be lower-cased.
        tk_rule = "//*[@id='{}']/@{}".format(elem_id, attr_name.lower())
        logger.info(tk_rule)
        tk = "".join(html.xpath(tk_rule).extract())
        logger.info("打印tk: {}".format(tk))
        baidu_code = html.xpath("//*[@id='baiducode']/text()").extract_first()
        tot = runtime.call("mix", tk, baidu_code)
        logger.info("tot参数值:{}".format(tot))
        return tot
Esempio n. 8
0
 def run(self, args, opts):
     """Launch every registered spider with the parsed CLI options."""
     names = self.crawler_process.spiders.list()
     logger.info("spider list:{}".format(names))
     for spider_name in names:
         self.crawler_process.crawl(spider_name, **opts.__dict__)
     self.crawler_process.start()