Ejemplo n.º 1
0
 def get_json(self):
     """Query the Baidu endpoint for every company in ``self.path`` and store the result.

     For each non-empty name returned by ``read_company2(self.path)``:

     * the company is skipped when ``Item_dump(company).item_dump()`` is truthy
       (presumably "already stored" -- TODO confirm against ``Item_dump``);
     * otherwise the Baidu ``resource_id=6899`` endpoint is requested, the body
       is handed to ``Handel_json`` and the parsed list is saved through
       ``self.save_mongodb`` under the ``"失信信息"`` key.

     Request failures are appended to ``log/ss_log.log`` and the company is
     skipped; the method itself returns nothing.
     """
     # Local import keeps this method self-contained with respect to the file header.
     from urllib.parse import quote

     for raw_name in read_company2(self.path):
         company = raw_name.strip()
         if not company:
             continue
         if Item_dump(company).item_dump():
             continue

         item = {}
         item["company"] = company
         item["type"] = self.type

         # Millisecond timestamps used for the ``_`` and ``t`` query parameters.
         nd = int(time.time()) * 1000
         nd1 = nd + 2500000
         # The name is percent-encoded so that reserved characters such as
         # '&', '#' or '=' inside a company name cannot break the query string.
         url = "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?resource_id=6899&query=%E5%A4%B1%E4%BF%A1%E8%A2%AB%E6%89%A7%E8%A1%8C%E4%BA%BA%E5%90%8D%E5%8D%95&cardNum=&iname={}&areaName=&ie=utf-8&oe=utf-8&format=json&t={}&cb=jQuery1102025075011778413225_{}&_={}".format(
             quote(company), nd1, self.code, nd)

         headers = {
             "User-Agent": Rand_ua().rand_chose(),
         }
         try:
             response = requests.get(url, headers=headers, timeout=60)
         except Exception as e:
             # Best effort: record the failure and move on to the next company.
             with open('log/ss_log.log', 'a') as f:
                 now = str(datetime.datetime.now())
                 f.write(now + ',' + str(e) + ',' + company + ',' +
                         '失信信息' + '\n')
             continue

         # Named ``payload`` rather than ``json`` so the json module is not shadowed.
         payload = response.content.decode()
         ss_list = Handel_json(payload, company).handel_json()
         item["失信信息"] = ss_list
         print(item)
         self.save_mongodb(item)
         # Pause between companies that were actually requested.
         time.sleep(3)
Ejemplo n.º 2
0
 def __init__(self,company_name):
     """Start a Chrome driver with a random User-Agent for ``company_name``.

     Sets ``self.cp_url``, ``self.company_name`` and ``self.driver``. Any
     exception raised while starting Chrome propagates to the caller.
     """
     user_agent = Rand_ua().rand_chose()
     self.cp_url = "http://wenshu.court.gov.cn/"
     self.company_name = company_name
     options = webdriver.ChromeOptions()
     # Raw string: the original literal relied on the invalid escapes "\p" and
     # "\c" being kept verbatim, which newer Python versions warn about.
     # The runtime value is unchanged.
     path = r"E:\python开发环境\chromedriver.exe"
     options.add_argument('--user-agent={}'.format(user_agent))
     # NOTE(review): ``chrome_options=`` / ``executable_path=`` are kept as in the
     # original; confirm they are still accepted by the installed Selenium version.
     self.driver = webdriver.Chrome(chrome_options=options, executable_path=path)
Ejemplo n.º 3
0
 def __init__(self, path, text_list=None, type=None,proxies=None):
     """Keep the run configuration and open the MongoDB collection used for storage.

     :param path: path to the company list
     :param text_list: test list
     :param type: company type
     :param proxies: stored on ``self.proxies`` for later use
     """
     # Caller-supplied configuration.
     self.path = path
     self.text_list = text_list
     self.type = type
     self.proxies = proxies
     # Template; ``{}`` is filled in later with a random value.
     self.captcha_url = 'http://zhixing.court.gov.cn/search/captcha.do?captchaId=fda97538121240b38b0c73eeac144dbe&random={}'

     # User-Agent provider, created before the database client as in the original order.
     self.u = Rand_ua()

     # Local MongoDB: database "qg_ss", collection "shesu_info".
     mongo_client = pymongo.MongoClient(host='127.0.0.1', port=27017)
     self.client = mongo_client
     database = mongo_client["qg_ss"]
     self.conn = database['shesu_info']
Ejemplo n.º 4
0
def text_dama():
    """Download one captcha image and print what the Damatu service decodes from it.

    Prints the type of the downloaded content and the decode result; returns
    nothing. Network or decode errors propagate to the caller.
    """
    headers = {'User-Agent': Rand_ua().rand_chose()}
    # The endpoint takes a random fraction in its ``random`` query parameter.
    captcha_url = 'http://zhixing.court.gov.cn/search/captcha.do?captchaId=fda97538121240b38b0c73eeac144dbe&random={}'.format(
        random.randint(999999999999999, 9999999999999999) / 10000000000000000)
    # A timeout keeps the call from hanging forever on an unresponsive server;
    # 60 seconds matches the other requests in this file.
    response = requests.get(captcha_url, headers=headers, timeout=60)
    print(type(response.content))
    # NOTE(review): credentials are hard-coded here; consider loading them from
    # configuration instead of source code.
    dmt = DamatuApi("469819183", "54188")
    # 42 is passed through unchanged; its meaning is defined by DamatuApi -- TODO confirm.
    ret = dmt.decode(response.content, 42)
    print(ret)
Ejemplo n.º 5
0
    def __init__(self, company):
        """Prepare a Chrome driver with a random User-Agent for ``company``.

        ``self.cp_url`` and ``self.company`` are always set. ``self.driver`` is
        the started Chrome driver, or ``None`` when Chrome could not be started
        (the error is recorded through ``Log('log/cp_log.log', ...)``).
        """
        # Assign the plain attributes first so the instance is never left
        # half-initialised when the browser fails to start.
        self.cp_url = "http://wenshu.court.gov.cn/"
        self.company = company
        self.driver = None

        user_agent = Rand_ua().rand_chose()
        try:
            options = webdriver.ChromeOptions()
            options.add_argument('--user-agent={}'.format(user_agent))
            self.driver = webdriver.Chrome(chrome_options=options)
        except Exception as e:
            # Best effort: log the failure and leave ``self.driver`` as None.
            Log('log/cp_log.log', e=e)
Ejemplo n.º 6
0
def _search_company(company, captcha_url):
    """Solve a captcha, run the court search for ``company`` and return its records.

    :param company: company name sent as the ``pname`` form field
    :param captcha_url: URL of the captcha image to download and decode
    :returns: whatever ``get_detail`` returns for the result page, or the
        string ``"验证码获取失败!"`` when the captcha could not be fetched/decoded
    :raises AssertionError: when the site reports a wrong captcha or the search
        response status is not 200
    """
    headers = {'User-Agent': Rand_ua().rand_chose()}
    try:
        captcha_response = requests.get(captcha_url,
                                        headers=headers,
                                        timeout=60)
        # NOTE(review): credentials are hard-coded here; consider moving them
        # to configuration.
        dmt = DamatuApi("469819183", "54188")
        captcha = dmt.decode(captcha_response.content, 42)
    except Exception as e:
        print(e)
        with open("log/shesu_log.log", 'a') as f:
            now = str(datetime.datetime.now())
            f.write(now + ',' + company + ',' + str(e) + '\n')
        return "验证码获取失败!"

    post_data = {
        "searchCourtName": "全国法院(包含地方各级法院)",
        "selectCourtId": 1,
        "selectCourtArrange": 1,
        "pname": company,
        "cardNum": "",
        "j_captcha": captcha,
        "captchaId": "fda97538121240b38b0c73eeac144dbe"
    }
    resp = requests.post('http://zhixing.court.gov.cn/search/newsearch',
                         data=post_data,
                         headers=headers,
                         timeout=60)
    print("查询++++++++++", resp.status_code)
    content = resp.content.decode()
    html = etree.HTML(content)
    # The page title tells us whether the captcha was rejected.
    text = html.xpath("//title/text()")[0]
    print("*" * 20, text)
    # Explicit raise instead of ``assert``: assert statements are removed when
    # Python runs with -O, which would silently skip this validation. The
    # exception type is unchanged for callers.
    if text == "验证码出现错误,请重新输入!" or resp.status_code != 200:
        raise AssertionError(
            "captcha rejected or unexpected status {}".format(resp.status_code))
    # Fetch the detail records for every hit on the result page.
    ss_list = get_detail(html, captcha, headers, company)

    return ss_list
Ejemplo n.º 7
0
    def __init__(self, company):
        """Prepare a Chrome driver (explicit chromedriver path) for ``company``.

        ``self.cp_url`` and ``self.company`` are always set. ``self.driver`` is
        the started Chrome driver, or ``None`` when Chrome could not be started
        (the error is recorded through ``Log('log/cp_log.log', ...)``).
        """
        # Assign the plain attributes first so the instance is never left
        # half-initialised when the browser fails to start.
        self.cp_url = "http://wenshu.court.gov.cn/"
        self.company = company
        self.driver = None

        user_agent = Rand_ua().rand_chose()
        try:
            options = webdriver.ChromeOptions()
            # Raw string: the original literal relied on the invalid escapes
            # "\p" and "\c" being kept verbatim; the runtime value is unchanged.
            path = r"E:\python开发环境\chromedriver.exe"
            options.add_argument('--user-agent={}'.format(user_agent))
            # Headless and image-free modes were tried earlier; per the original
            # note, the image-free mode gave unsatisfactory results.
            self.driver = webdriver.Chrome(chrome_options=options,executable_path=path)
        except Exception as e:
            # Best effort: log the failure and leave ``self.driver`` as None.
            Log('log/cp_log.log', e=e)
Ejemplo n.º 8
0
 def get_detail(self, href):
     """Download the text of one judgment document referenced by ``href``.

     ``href`` is expected to carry a ``DocID=<id>`` query parameter, e.g.
     ``/content/content?DocID=b18f2733-...&KeyWord=...``.

     :returns: the list produced by the ``//body//text()`` XPath on the document
         page; ``"未获取到文书id"`` when ``href`` is that sentinel or holds no
         DocID; ``"获取文书内容失败"`` when the download or parsing fails
     """
     if href == "未获取到文书id":
         return "未获取到文书id"

     # Match up to the next '&' or the end of the string, so a DocID that is
     # the last query parameter is found as well.
     found = re.search(r"DocID=([^&]+)", href)
     if found is None:
         # Previously this path fell through and returned None implicitly.
         return "未获取到文书id"
     doc_id = found.group(1)

     headers = {"User-Agent": Rand_ua().rand_chose()}
     d_url = "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID={}".format(
         doc_id)
     print(d_url)
     try:
         ret = requests.get(d_url, headers=headers, timeout=60)
         xml = etree.HTML(ret.content.decode())
         ws_detail = xml.xpath("//body//text()")
     except Exception as e:
         # Download, decode and XPath failures are all reported the same way.
         print("*" * 10, str(e))
         Log('log/cp_log.log', e=e)
         return "获取文书内容失败"
     # Pause after a successful fetch before the caller issues the next one.
     time.sleep(2)
     return ws_detail
Ejemplo n.º 9
0
class ZhixingSpider:
    """Look companies up on zhixing.court.gov.cn and store the hits in MongoDB.

    For each company a captcha is downloaded and decoded, the search form is
    posted, and one detail request is made per result row. The collected rows
    are stored under the ``"涉诉信息"`` key of the saved item.
    """

    def __init__(self, path, text_list=None, type=None,proxies=None):
        """Store the configuration and open the ``qg_ss.shesu_info`` collection."""
        self.proxies = proxies
        self.u = Rand_ua()
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.conn = self.client["qg_ss"]['shesu_info']
        self.type = type  # company type
        self.text_list = text_list  # test list
        self.path = path  # path to the company list
        self.captcha_url = 'http://zhixing.court.gov.cn/search/captcha.do?captchaId=fda97538121240b38b0c73eeac144dbe&random={}'

    def _log_failure(self, company, error):
        """Append a ``timestamp,company,error`` line to log/shesu_log.log."""
        with open("log/shesu_log.log", 'a') as f:
            now = str(datetime.datetime.now())
            f.write(now + ',' + company + ',' + str(error) + '\n')

    def _new_captcha_url(self):
        """Fill the captcha URL template with a fresh random fraction."""
        return self.captcha_url.format(
            random.randint(999999999999999, 9999999999999999) / 10000000000000000)

    @retry(stop_max_attempt_number=5, stop_max_delay=60000)
    def _search_company(self, company, captcha_url):
        """Solve a captcha, post the search for ``company`` and collect its records.

        Retried by the ``retry`` decorator whenever an exception escapes.

        :returns: the list built by ``get_detail``, or the string
            ``"验证码获取失败!"`` when the captcha could not be fetched or decoded
        :raises AssertionError: when the site reports a wrong captcha or the
            search response status is not 200
        """
        ua = self.u.rand_chose()
        headers = {'User-Agent': ua}
        try:
            captcha_response = requests.get(captcha_url, headers=headers, timeout=60, proxies=self.proxies)
            captcha = indetify(captcha_response.content)
        except Exception as e:
            print(e)
            self._log_failure(company, e)
            return  "验证码获取失败!"

        post_data = {
            "searchCourtName": "全国法院(包含地方各级法院)",
            "selectCourtId": 1,
            "selectCourtArrange": 1,
            "pname": company,
            "cardNum": "",
            "j_captcha": captcha,
            "captchaId": "fda97538121240b38b0c73eeac144dbe"
        }
        resp = requests.post('http://zhixing.court.gov.cn/search/newsearch', data=post_data, headers=headers, timeout=60,proxies=self.proxies)
        print("查询++++++++++", resp.status_code)
        content = resp.content.decode()
        html = etree.HTML(content)
        # The page title tells us whether the captcha was rejected.
        text = html.xpath("//title/text()")[0]
        print("*"*20, text)
        # Explicit raise instead of ``assert``: assert statements are removed
        # under ``python -O``, which would stop the retry from ever triggering.
        if text == "验证码出现错误,请重新输入!" or resp.status_code != 200:
            raise AssertionError(
                "captcha rejected or unexpected status {}".format(resp.status_code))
        # Request the detail record of every hit; the company name is passed
        # along so failed detail requests can be logged.
        ss_list = self.get_detail(html, captcha, headers, company)

        return ss_list

    def search_company(self, company, captcha_url):
        """Run ``_search_company`` and fall back to a placeholder on failure.

        On any exception the error is logged, the company is appended to
        ``log/except_company.csv`` and ``["未获取到信息"]`` is returned.
        """
        try:
            ss_list = self._search_company(company, captcha_url)
        except Exception as e:
            print(e)
            self._log_failure(company, e)
            ss_list = ["未获取到信息"]
            with open("log/except_company.csv", 'a') as f:
                f.write(company+'\n')

        return ss_list

    def get_detail(self, html, captcha, headers, company=""):
        """Fetch the detail record of every result row found in ``html``.

        :param html: parsed result page (an object supporting ``.xpath``)
        :param captcha: decoded captcha text, reused for the detail requests
        :param headers: HTTP headers to send with each detail request
        :param company: company name, used only when logging a failed request
        :returns: a list with one
            ``[pname, caseCode, caseCreateTime, partyCardNum, execCourtName, execMoney]``
            list per row; missing fields are replaced by ``"未获取到"``
        """
        ss_list = []
        tr_list = html.xpath("//tbody//tr")
        # Rows are only processed when the table holds more than one row.
        if len(tr_list) <= 1:
            return ss_list

        print("*" * 20)
        # Fields copied from each detail record, in output order:
        # name, case number, filing time, ID number, enforcing court, amount.
        wanted = ("pname", "caseCode", "caseCreateTime",
                  "partyCardNum", "execCourtName", "execMoney")
        for tr in tr_list:
            row_ids = tr.xpath(".//td[@align='center']/a/@id")
            if not row_ids:
                continue
            row_id = row_ids[0]
            # Build the detail URL, e.g.
            # http://zhixing.court.gov.cn/search/newdetail?id=16900266&j_captcha=pzt8&captchaId=...&_=1515212716230
            time_id = int(time.time())*1000
            detail_url = "http://zhixing.court.gov.cn/search/newdetail?id={}&j_captcha={}&captchaId=fda97538121240b38b0c73eeac144dbe&_={}".format(row_id, captcha, time_id)
            try:
                ret = requests.get(detail_url, headers=headers, timeout=60,proxies=self.proxies)
            except Exception as e:
                # Previously this handler referenced an undefined ``company``
                # and raised NameError instead of logging and moving on.
                self._log_failure(company, e)
                continue
            print("查看++++++++++",ret.status_code)
            ret_dic = json.loads(ret.content.decode())
            # Anything that is not a JSON object yields only placeholders,
            # matching what the former catch-all lookups produced.
            if not isinstance(ret_dic, dict):
                ret_dic = {}
            ss_list.append([ret_dic.get(key, "未获取到") for key in wanted])
            time.sleep(2)

        return ss_list

    def save_mongodb(self, item):
        """Insert a copy of ``item`` into the MongoDB collection."""
        self.conn.insert_one(dict(item))
        print("保存成功!")

    def run(self):
        """Process every company from ``read_company1(self.path)`` and save the results.

        Companies for which ``Item_dump(company).item_dump()`` is truthy are skipped.
        """
        company_list = read_company1(self.path)

        for company in company_list:
            if Item_dump(company).item_dump():
                continue
            item = {}
            item["company"] = company
            item["type"] = self.type
            ss_list = self.search_company(company, self._new_captcha_url())
            item["涉诉信息"] = ss_list
            self.save_mongodb(item)
            print(item)

    # Test mode
    def run_text(self):
        """Process every company from ``read_company2(self.path)`` and only print the results."""
        company_list = read_company2(self.path)
        for company in company_list:
            item = {}
            item["company"] = company
            item["type"] = self.type
            ss_list = self.search_company(company, self._new_captcha_url())
            item["涉诉信息"] = ss_list
            print(item)