def getCaptcha(): global sess, vl5x ua = random.choice(ua_list) while True: try: # 获取验证码 发送验证码 验证guid yzm = execjs.compile(''' function createGuid() { return (((1 + Math.random()) * 0x10000) | 0).toString(16).substring(1); } function ref() { var guid = createGuid() + createGuid() + "-" + createGuid() + "-" + createGuid() + createGuid() + "-" + createGuid() + createGuid() + createGuid(); //CreateGuid(); return guid; } ''') guid = yzm.call('ref') yzm_url = 'http://wenshu.court.gov.cn/ValiCode/CreateCode/?guid={}'.format( guid) headers = { 'User-Agent': ua, 'X-Forwarded-For': '{}.{}.{}.{},113.88.176.160'.format(random.randint(1, 254), random.randint(1, 254), random.randint(1, 254), random.randint(1, 254)) } # yzm_url = 'http://wenshu.court.gov.cn/User/ValidateCode?guid={}'.format(guid) yzm = sess.get(yzm_url, headers=headers, allow_redirects=False) if yzm.status_code == 302: sess, vl5x = getSess() continue if yzm.status_code >= 500: print 'the service is bad and response_status_code is %s, wait one minute retry' % ( yzm.status_code) time.sleep(60) continue with open('captcha.jpg', 'wb') as f: f.write(yzm.content) captcha = result_captcha('captcha.jpg') return captcha, guid except Exception as e: print 'get captcah bad retry again' pass
def insertMysql(docID, anyou, count):
    '''
    Fetch the judgment document identified by docID and store it in MySQL.

    :param docID: document ID of the judgment
    :param anyou: cause-of-action (anyou) condition this doc was found under
    :param count: number of documents matching that condition
    :return: None; retries up to 5 times, logging an error on final failure
    '''
    global sess, vl5x
    doc_url = 'http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID={}'.format(
        docID)
    i = 0
    while True:
        try:
            # Random UA plus a spoofed X-Forwarded-For to dodge rate limiting.
            ua = random.choice(ua_list)
            headers = {
                'User-Agent': ua,
                'X-Forwarded-For': '{}.{}.{}.{},113.88.176.160'.format(random.randint(1, 254),
                                                                       random.randint(1, 254),
                                                                       random.randint(1, 254),
                                                                       random.randint(1, 254))
            }
            doc_response = sess.get(doc_url, headers=headers)
            if doc_response.status_code == 302:  # redirected: cookie expired, get a new session
                sess, vl5x = getSess()
                continue
            if doc_response.status_code >= 500:  # server-side error: back off a minute
                print 'the service is bad and response_status_code is %s, wait one minute retry' % (
                    doc_response.status_code)
                time.sleep(60)
                continue
            try:
                # A normal response is a JS payload, not plain JSON, so
                # json.loads only succeeds on the site's anti-crawl sentinel
                # strings; failures here are deliberately swallowed below.
                data_unicode = json.loads(
                    doc_response.text)
                if data_unicode == u'remind key':  # cookie expired
                    sess, vl5x = getSess()
                    print 'getFirstPage response content is remind key retry again'
                    continue
                if data_unicode == u'remind':  # cookie over-used: solve the visit captcha
                    sess, vl5x = getSess()
                    ua = random.choice(ua_list)
                    remind_captcha = sess.get(
                        'http://wenshu.court.gov.cn/User/ValidateCode',
                        headers=headers)
                    img = retrive_img(remind_captcha)
                    img = process_img(img)
                    captcha = recognize(img)
                    captcha_data = {'ValidateCode': captcha}
                    sess.post(
                        'http://wenshu.court.gov.cn/Content/CheckVisitCode',
                        headers=headers,
                        data=captcha_data)
                    print 'getFirstPage response content is remind retry again'
                    continue
            except Exception as e:
                pass
            # The payload embeds backslash-escaped JSON: grab the raw
            # Title/PubDate spans first, then strip the \":\" escaping with
            # raw-string patterns.
            title = re.search(u'''.*?"Title(.*?)PubDate''', doc_response.content).group(1)
            pubDate = re.search(u'''.*?PubDate(.*?)Html''', doc_response.content).group(1)
            title = re.match(r'\\":\\"(.*?)\\",\\"', title).group(1).decode('utf-8')
            pubDate = re.match(r'\\":\\"(.*?)\\",\\"', pubDate).group(1)
            select = Selector(doc_response)
            doc_list = select.xpath("//div//text()").extract()
            content = ' '.join(doc_list)
            doc = u'标题:' + title + u',发布日期:' + pubDate + u',正文:' + content
            cursor.execute(
                '''insert into wenshu(anyou, count, doc_id,content) VALUES (%s,%s,%s,%s)''',
                (anyou, count, docID, doc))
            conn.commit()
            time.sleep(1)  # throttle between documents
            break
        except Exception as e:
            time.sleep(2)
            i += 1
            if i == 5:  # give up after 5 failed attempts and record which doc broke
                message = anyou + ': ' + str(docID) + str(e).decode(
                    'utf-8') + ' ' + 'is bad'
                logger.error(message)
                print(message)
                break
            continue
def getFirstPage(condition, total, index=1):
    '''
    Fetch one result page for a cause-of-action (anyou) condition.

    :param condition: search condition, e.g. u'一级案由:...'
    :param total: list of ancestor anyou conditions (joined for log messages)
    :param index: 1-based page number to request
    :return: (count, id_list) -- total doc count for the condition and the
             document IDs on this page; ('', '') after 5 failed attempts
    '''
    global sess, vl5x
    anyou = ','.join(total)
    # Map the level label (first 4 chars of the condition) to an indent depth
    # used for the progress print-out.
    num_level = 0
    level = condition[0:4]
    if u'一级案由' == level:
        num_level = 1
    elif u'二级案由' == level:
        num_level = 2
    elif u'三级案由' == level:
        num_level = 3
    elif u'四级案由' == level:
        num_level = 4
    i = 0
    while i < 5:
        captcha, guid = getCaptcha()  # a fresh captcha+guid is needed for every request
        form_data = {
            'Param': condition,
            'Index': index,
            'Page': 20,
            'Order': '法院层级',
            'Direction': 'asc',
            'vl5x': vl5x,
            'number': captcha,
            'guid': guid,
        }
        try:
            ua = random.choice(ua_list)
            headers = {
                'User-Agent': ua,
                'X-Forwarded-For': '{}.{}.{}.{},113.88.176.160'.format(random.randint(1, 254),
                                                                       random.randint(1, 254),
                                                                       random.randint(1, 254),
                                                                       random.randint(1, 254))
            }
            print num_level * ' ' + '%s is crawling index is %s' % (
                condition, index)
            response = sess.post('http://wenshu.court.gov.cn/List/ListContent',
                                 headers=headers,
                                 data=form_data)
            if response.status_code == 302:  # redirected: cookie expired, renew session
                sess, vl5x = getSess()
                continue
            if response.status_code >= 500:
                print 'the service is bad and response_status_code is %s, wait one minute retry' % (
                    response.status_code)
                time.sleep(60)
                continue
            # First decode: yields either an anti-crawl sentinel string or a
            # unicode string that itself contains JSON (double-encoded).
            data_unicode = json.loads(response.text)
            if data_unicode == u'remind key':  # cookie expired
                sess, vl5x = getSess()
                print 'getFirstPage response content is remind key retry again'
                continue
            if data_unicode == u'remind':  # cookie over-used: solve the visit captcha
                sess, vl5x = getSess()
                ua = random.choice(ua_list)
                remind_captcha = sess.get(
                    'http://wenshu.court.gov.cn/User/ValidateCode',
                    headers=headers)
                img = retrive_img(remind_captcha)
                img = process_img(img)
                captcha = recognize(img)
                captcha_data = {'ValidateCode': captcha}
                sess.post('http://wenshu.court.gov.cn/Content/CheckVisitCode',
                          headers=headers,
                          data=captcha_data)
                print 'getFirstPage response content is remind retry again'
                continue
            # Document IDs on this page, pulled from the still-escaped JSON text.
            id_list = re.findall(u'''.*?"文书ID\\":\\"(.*?)\\",''', data_unicode)
            # Second decode gives the real list; element 0 carries the total
            # 'Count' used to decide how many pages to crawl.
            data_list = json.loads(data_unicode)
            if len(data_list) == 0:
                time.sleep(2)
                print 'getFirstPage response content is [] retry again'
                continue
            count = data_list[0]['Count']
            count = int(count)
            return count, id_list
        except Exception as e:
            i += 1
            if i == 5:  # give up after 5 failed attempts and record which page broke
                message = anyou + ': ' + str(index) + str(e).decode(
                    'utf-8') + ' ' + 'is bad'
                logger.error(message)
                print(message)
    return '', ''
__email__ = "*****@*****.**" '''只能爬取100页 没有添加查询条件''' import execjs import json import re import pymysql import math from wenshu.utils.wenshu_log import getLogger from wenshu.utils.wenshu_session import getSess from wenshu.utils.yundama import result_captcha import time from wenshu.utils.captcha_local import retrive_img, process_img, recognize import random from scrapy.selector import Selector import requests sess, vl5x = getSess() logger = getLogger(__name__) conn = pymysql.Connection(host='127.0.0.1', user='******', password='******', database='crawl', charset='utf8mb4', use_unicode=True) cursor = conn.cursor() headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0', 'X-Forwarded-For':
def getLastLevel(level, name, f, total): ''' 定位到最底层的子案由 :param level: 案由层级 一级案由 二级案由 三级案由 四级案由 :param name: 案由名称 :param f: :param total: 当前案由的 所有父类案由 :return: ''' global sess, vl5x, FLAG if isinstance(level, str): level = level.decode('utf-8') if isinstance(name, str): name = name.decode('utf-8') condition = level + ':' + name # 判断level是几级案由 if u'一级案由' == level: num_level = 1 total[0] = condition elif u'二级案由' == level: num_level = 2 total[1] = condition elif u'三级案由' == level: num_level = 3 total[2] = condition elif u'四级案由' == level: num_level = 4 total[3] = condition # 构造请求案由请求表单 form_data = {'Param': level + ':' + name, 'parval': name} i = 0 while i < 3: try: response = sess.post( 'http://wenshu.court.gov.cn/List/ReasonTreeContent', headers=headers, data=form_data) if response.status_code == 302: #网站重定向 重新获取cookie (sess, vl5x) = getSess() # getLastLevel(level, name, f, total) continue if response.status_code >= 500: print 'the service is bad and response_status_code is %s, wait one minute retry' % ( response.status_code) time.sleep(60) continue key_data = json.loads(response.text) # Unicode reasonTree = json.loads(key_data) # dict next_level = reasonTree[0]['Key'] # 下一级案由 count = reasonTree[0]['Value'] # 获取当前案由的子类案由个数 如果子类案由数为0 则可以进行爬取 count = int(count) if count == 0: # 说明到了最底层 可以构造参数 访问文档 print num_level * ' ', level, name, 'is crawling' f.write((num_level * ' ' + level + ': ' + name + '\n').encode('utf-8')) f.flush() # if name == u'过失以危险方法危害公共安全': # FLAG = True if True: crwal_condition(condition, total) return else: print num_level * ' ', level, name f.write((num_level * ' ' + level + ': ' + name + '\n').encode('utf-8')) f.flush() key_list = re.findall(u'.*?\{\\"Key\\":\\"(.*?)\\",\\"Value\\"', key_data) key_list = [i for i in key_list if i != ''] #获取当前案由下的子案由 break except Exception as e: time.sleep(2) if i == 2: print e print num_level * ' ', level, name logger.error(condition + ' ' + ','.join(total) + str(e).decode('utf-8')) return i += 1 if next_level == 
level: #如果子案由==当前案由会造成死循环 return for next_name in key_list[1:]: # Value = child_dice['Value'] # print next_level,next_name, Value time.sleep(2) getLastLevel(next_level, next_name, f, total)
from crawl import crwal_condition from wenshu.utils.wenshu_log import getLogger from wenshu.utils.wenshu_session import getSess '''递归 深度爬取 以案由为选择条件 爬取总数小于等于2000的案由''' logger = getLogger(__name__) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0', 'X-Forwarded-For': '{}.{}.{}.{},113.88.176.160'.format(random.randint(1, 254), random.randint(1, 254), random.randint(1, 254), random.randint(1, 254)) } # 获取sess和加密后的cookie字段 (sess, vl5x) = getSess() FLAG = False def getLastLevel(level, name, f, total): ''' 定位到最底层的子案由 :param level: 案由层级 一级案由 二级案由 三级案由 四级案由 :param name: 案由名称 :param f: :param total: 当前案由的 所有父类案由 :return: ''' global sess, vl5x, FLAG if isinstance(level, str): level = level.decode('utf-8')