コード例 #1
0
ファイル: crawl.py プロジェクト: wyx0722/wenshu
def getCaptcha():
    global sess, vl5x
    ua = random.choice(ua_list)
    while True:
        try:
            # 获取验证码 发送验证码 验证guid
            yzm = execjs.compile('''
            function createGuid() {
                return (((1 + Math.random()) * 0x10000) | 0).toString(16).substring(1);
            }
            function ref() {
                var guid = createGuid() + createGuid() + "-" + createGuid() + "-" + createGuid() + createGuid() + "-" + createGuid() + createGuid() + createGuid(); //CreateGuid();
                return guid;
            }
            ''')
            guid = yzm.call('ref')
            yzm_url = 'http://wenshu.court.gov.cn/ValiCode/CreateCode/?guid={}'.format(
                guid)
            headers = {
                'User-Agent':
                ua,
                'X-Forwarded-For':
                '{}.{}.{}.{},113.88.176.160'.format(random.randint(1, 254),
                                                    random.randint(1, 254),
                                                    random.randint(1, 254),
                                                    random.randint(1, 254))
            }
            # yzm_url = 'http://wenshu.court.gov.cn/User/ValidateCode?guid={}'.format(guid)
            yzm = sess.get(yzm_url, headers=headers, allow_redirects=False)
            if yzm.status_code == 302:
                sess, vl5x = getSess()
                continue
            if yzm.status_code >= 500:
                print 'the service is bad and response_status_code is %s, wait one minute retry' % (
                    yzm.status_code)
                time.sleep(60)
                continue
            with open('captcha.jpg', 'wb') as f:
                f.write(yzm.content)
            captcha = result_captcha('captcha.jpg')
            return captcha, guid
        except Exception as e:
            print 'get captcah bad retry again'
            pass
コード例 #2
0
ファイル: crawl.py プロジェクト: wyx0722/wenshu
def insertMysql(docID, anyou, count):
    '''
    Fetch one judgment document by its ID and insert it into MySQL.

    Retries up to 5 times on failure, renewing the global session on a
    302 redirect and solving the site's visit-check captcha when the body
    is one of the anti-crawl sentinels ('remind key' / 'remind').

    :param docID: document ID to fetch
    :param anyou: case-cause filter condition the ID was found under
    :param count: total number of documents matching that condition
    :return: None (one row written to the wenshu table on success)
    '''
    global sess, vl5x
    doc_url = 'http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID={}'.format(
        docID)
    i = 0  # consecutive-failure counter; give up after 5
    while True:
        try:
            ua = random.choice(ua_list)
            # Random UA plus a randomized X-Forwarded-For chain.
            headers = {
                'User-Agent':
                ua,
                'X-Forwarded-For':
                '{}.{}.{}.{},113.88.176.160'.format(random.randint(1, 254),
                                                    random.randint(1, 254),
                                                    random.randint(1, 254),
                                                    random.randint(1, 254))
            }
            doc_response = sess.get(doc_url, headers=headers)

            if doc_response.status_code == 302:  # redirected: cookie expired, fetch a fresh session
                sess, vl5x = getSess()
                continue
            if doc_response.status_code >= 500:  # server-side error: wait a minute and retry
                print 'the service is bad and response_status_code is %s, wait one minute retry' % (
                    doc_response.status_code)
                time.sleep(60)
                continue
            try:
                data_unicode = json.loads(
                    doc_response.text)  # a JSON body is an error sentinel, not a document
                if data_unicode == u'remind key':
                    # Session cookie expired.
                    sess, vl5x = getSess()
                    print 'getFirstPage response content is remind key retry again'
                    continue
                if data_unicode == u'remind':
                    # Cookie used too many times: solve the visit-check
                    # captcha locally and submit it to clear the block.
                    sess, vl5x = getSess()
                    ua = random.choice(ua_list)
                    remind_captcha = sess.get(
                        'http://wenshu.court.gov.cn/User/ValidateCode',
                        headers=headers)
                    img = retrive_img(remind_captcha)
                    img = process_img(img)
                    captcha = recognize(img)
                    captcha_data = {'ValidateCode': captcha}
                    sess.post(
                        'http://wenshu.court.gov.cn/Content/CheckVisitCode',
                        headers=headers,
                        data=captcha_data)
                    print 'getFirstPage response content is remind  retry again'
                    continue
            except Exception as e:
                # Body is not JSON -> it is a real document page; fall through.
                pass
            # Fields contain escaped quotes: grab the whole escaped span
            # first, then strip the \":\" escaping in a second pass.
            title = re.search(u'''.*?"Title(.*?)PubDate''',
                              doc_response.content).group(1)
            pubDate = re.search(u'''.*?PubDate(.*?)Html''',
                                doc_response.content).group(1)
            title = re.match(r'\\":\\"(.*?)\\",\\"',
                             title).group(1).decode('utf-8')
            pubDate = re.match(r'\\":\\"(.*?)\\",\\"', pubDate).group(1)
            select = Selector(doc_response)
            doc_list = select.xpath("//div//text()").extract()
            content = '  '.join(doc_list)
            doc = u'标题:' + title + u',发布日期:' + pubDate + u',正文:' + content
            cursor.execute(
                '''insert into wenshu(anyou, count, doc_id,content) VALUES (%s,%s,%s,%s)''',
                (anyou, count, docID, doc))
            conn.commit()
            time.sleep(1)  # polite delay between document fetches
            break
        except Exception as e:
            time.sleep(2)
            i += 1
            if i == 5:
                # Out of retries: log which document failed and move on.
                message = anyou + ': ' + str(docID) + str(e).decode(
                    'utf-8') + '   ' + 'is bad'
                logger.error(message)
                print(message)
                break
            continue
コード例 #3
0
ファイル: crawl.py プロジェクト: wyx0722/wenshu
def getFirstPage(condition, total, index=1):
    '''
    Fetch one result-list page for a case-cause condition.

    Posts the search form (with a freshly solved captcha + guid from
    getCaptcha) to the site's ListContent endpoint, handling session
    renewal, 5xx back-off and the anti-crawl sentinels.

    :param condition: level-prefixed cause filter, e.g. u'一级案由:...'
    :param total: list of ancestor cause conditions (joined for logging)
    :param index: 1-based page number to request (20 results per page)
    :return: (count, id_list) on success; ('', '') after 5 failed tries
    '''
    global sess, vl5x
    anyou = ','.join(total)
    # Map the level prefix (first 4 chars of condition) to a depth used
    # only for indenting progress output.
    num_level = 0
    level = condition[0:4]
    if u'一级案由' == level:
        num_level = 1
    elif u'二级案由' == level:
        num_level = 2
    elif u'三级案由' == level:
        num_level = 3
    elif u'四级案由' == level:
        num_level = 4

    i = 0  # consecutive-failure counter
    while i < 5:
        captcha, guid = getCaptcha()  # a fresh captcha + guid is required for every request
        form_data = {
            'Param': condition,
            'Index': index,
            'Page': 20,
            'Order': '法院层级',
            'Direction': 'asc',
            'vl5x': vl5x,
            'number': captcha,
            'guid': guid,
        }
        try:
            ua = random.choice(ua_list)
            # Random UA plus a randomized X-Forwarded-For chain.
            headers = {
                'User-Agent':
                ua,
                'X-Forwarded-For':
                '{}.{}.{}.{},113.88.176.160'.format(random.randint(1, 254),
                                                    random.randint(1, 254),
                                                    random.randint(1, 254),
                                                    random.randint(1, 254))
            }
            print num_level * '    ' + '%s is crawling index is %s' % (
                condition, index)
            response = sess.post('http://wenshu.court.gov.cn/List/ListContent',
                                 headers=headers,
                                 data=form_data)
            if response.status_code == 302:
                # Redirected: session cookie expired, renew it.
                sess, vl5x = getSess()
                continue
            if response.status_code >= 500:
                print 'the service is bad and response_status_code is %s, wait one minute retry' % (
                    response.status_code)
                time.sleep(60)
                continue
            # Deserialize the response body (a JSON-encoded string).
            data_unicode = json.loads(response.text)

            if data_unicode == u'remind key':
                # Session cookie expired.
                sess, vl5x = getSess()
                print 'getFirstPage response content is remind key retry again'
                continue
            if data_unicode == u'remind':
                # Cookie used too many times: solve the visit-check captcha
                # locally and submit it to clear the block.
                sess, vl5x = getSess()
                ua = random.choice(ua_list)
                remind_captcha = sess.get(
                    'http://wenshu.court.gov.cn/User/ValidateCode',
                    headers=headers)
                img = retrive_img(remind_captcha)
                img = process_img(img)
                captcha = recognize(img)
                captcha_data = {'ValidateCode': captcha}
                sess.post('http://wenshu.court.gov.cn/Content/CheckVisitCode',
                          headers=headers,
                          data=captcha_data)
                print 'getFirstPage response content is remind  retry again'
                continue
            # Document IDs ("文书ID") embedded in this page of results.
            id_list = re.findall(u'''.*?"文书ID\\":\\"(.*?)\\",''', data_unicode)
            # count is the total number of documents matching the condition;
            # the caller uses it to decide how many pages to crawl.
            data_list = json.loads(data_unicode)
            if len(data_list) == 0:
                time.sleep(2)
                print 'getFirstPage response content is [] retry again'
                continue
            count = data_list[0]['Count']
            count = int(count)
            return count, id_list
        except Exception as e:
            i += 1
            if i == 5:
                # Out of retries: log which page failed and bail out.
                message = anyou + ': ' + str(index) + str(e).decode(
                    'utf-8') + '   ' + 'is bad'
                logger.error(message)
                print(message)
                return '', ''
コード例 #4
0
ファイル: crawl.py プロジェクト: wyx0722/wenshu
__email__ = "*****@*****.**"
'''只能爬取100页 没有添加查询条件'''
import execjs
import json
import re
import pymysql
import math
from wenshu.utils.wenshu_log import getLogger
from wenshu.utils.wenshu_session import getSess
from wenshu.utils.yundama import result_captcha
import time
from wenshu.utils.captcha_local import retrive_img, process_img, recognize
import random
from scrapy.selector import Selector
import requests
sess, vl5x = getSess()

logger = getLogger(__name__)

conn = pymysql.Connection(host='127.0.0.1',
                          user='******',
                          password='******',
                          database='crawl',
                          charset='utf8mb4',
                          use_unicode=True)
cursor = conn.cursor()

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'X-Forwarded-For':
コード例 #5
0
def getLastLevel(level, name, f, total):
    '''
    Recursively descend the case-cause tree to its leaf causes.

    Queries the site's ReasonTreeContent endpoint for the children of
    (level, name).  A leaf (zero children) is handed to crwal_condition
    for actual crawling; otherwise each child cause is recursed into.
    Progress is written to *f*, indented one level per tree depth.

    :param level: cause level label: 一级案由/二级案由/三级案由/四级案由
    :param name: cause name at that level
    :param f: open file used as a progress/trace log
    :param total: 4-slot list of ancestor conditions, indexed by level-1
    :return: None
    '''
    global sess, vl5x, FLAG
    # Python 2: normalize byte strings to unicode before comparing/joining.
    if isinstance(level, str):
        level = level.decode('utf-8')
    if isinstance(name, str):
        name = name.decode('utf-8')

    condition = level + ':' + name

    # Map the level label to a depth and record this condition in the
    # ancestor slot for that depth.
    if u'一级案由' == level:
        num_level = 1
        total[0] = condition
    elif u'二级案由' == level:
        num_level = 2
        total[1] = condition
    elif u'三级案由' == level:
        num_level = 3
        total[2] = condition
    elif u'四级案由' == level:
        num_level = 4
        total[3] = condition
    # Build the request form for this cause node.
    form_data = {'Param': level + ':' + name, 'parval': name}
    i = 0
    while i < 3:
        try:
            response = sess.post(
                'http://wenshu.court.gov.cn/List/ReasonTreeContent',
                headers=headers,
                data=form_data)
            if response.status_code == 302:  # redirected: cookie expired, fetch a fresh session
                (sess, vl5x) = getSess()
                # getLastLevel(level, name, f, total)
                continue
            if response.status_code >= 500:
                print 'the service is bad and response_status_code is %s, wait one minute retry' % (
                    response.status_code)
                time.sleep(60)
                continue
            key_data = json.loads(response.text)  # unicode string (double-encoded JSON)
            reasonTree = json.loads(key_data)  # decoded again into a dict/list
            next_level = reasonTree[0]['Key']  # label of the next level down
            count = reasonTree[0]['Value']  # number of child causes; 0 means this is a leaf
            count = int(count)
            if count == 0:  # leaf reached: this condition can be crawled directly
                print num_level * '    ', level, name, 'is crawling'
                f.write((num_level * '    ' + level + ': ' + name +
                         '\n').encode('utf-8'))
                f.flush()
                #                 if name == u'过失以危险方法危害公共安全':
                #                     FLAG = True
                if True:
                    crwal_condition(condition, total)
                return
            else:
                print num_level * '    ', level, name
                f.write((num_level * '    ' + level + ': ' + name +
                         '\n').encode('utf-8'))
                f.flush()
            key_list = re.findall(u'.*?\{\\"Key\\":\\"(.*?)\\",\\"Value\\"',
                                  key_data)
            key_list = [i for i in key_list if i != '']  # child cause names under this node
            break
        except Exception as e:
            time.sleep(2)
            if i == 2:
                # Out of retries: log this subtree and abandon it.
                print e
                print num_level * '    ', level, name
                logger.error(condition + '    ' + ','.join(total) +
                             str(e).decode('utf-8'))
                return
            i += 1

    if next_level == level:  # a self-referencing child would recurse forever
        return

    # First entry of key_list is the node itself; recurse into the children.
    for next_name in key_list[1:]:
        # Value = child_dice['Value']
        # print next_level,next_name, Value
        time.sleep(2)
        getLastLevel(next_level, next_name, f, total)
コード例 #6
0
# Recursive depth-first crawl keyed on case-cause; only causes whose total
# document count is small enough to list fully are crawled.
import random

from crawl import crwal_condition
from wenshu.utils.wenshu_log import getLogger
from wenshu.utils.wenshu_session import getSess
'''递归 深度爬取 以案由为选择条件 爬取总数小于等于2000的案由'''
logger = getLogger(__name__)
# Fixed browser UA plus a randomized X-Forwarded-For chain so requests
# appear to originate from varying client IPs.  `random` was used below
# but never imported in this module, which raised NameError at import
# time; the import is added above.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
    'X-Forwarded-For':
    '{}.{}.{}.{},113.88.176.160'.format(random.randint(1, 254),
                                        random.randint(1, 254),
                                        random.randint(1, 254),
                                        random.randint(1, 254))
}
# Session object and the encrypted vl5x cookie token required by the site.
(sess, vl5x) = getSess()
FLAG = False


def getLastLevel(level, name, f, total):
    '''
    定位到最底层的子案由
    :param level: 案由层级 一级案由 二级案由 三级案由 四级案由
    :param name: 案由名称
    :param f:
    :param total: 当前案由的 所有父类案由
    :return:
    '''
    global sess, vl5x, FLAG
    if isinstance(level, str):
        level = level.decode('utf-8')