Code Example #1
def free_time(pool: ProxyPool):
    now_time = datetime.datetime.now()
    hours = now_time.strftime('%H')
    int_hours = int(hours)
    ip_proxy_cache = pool.get_ip_proxy_cache()
    # Idle window: 19:00-23:59 plus 01:00-08:59. Hour 0 (midnight) is not in
    # the tuple, so it falls through to the normal-hours branch.
    if int_hours in (19, 20, 21, 22, 23, 1, 2, 3, 4, 5, 6, 7, 8):
        logging.info("=*= within idle hours =*=")
        if ip_proxy_cache != IP_PROXY_FREE_CACHE:
            logging.info("=*= idle hours: changing the IP count =*=")
            pool.change_ip_proxy_cache(IP_PROXY_FREE_CACHE)
    else:
        logging.info("=*= within normal hours =*=")
        if ip_proxy_cache != IP_PROXY_CACHE:
            logging.info("=*= normal hours: changing the IP count =*=")
            pool.change_ip_proxy_cache(IP_PROXY_CACHE)
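The tuple test above is equivalent to two closed hour ranges; a minimal standalone restatement (hour 0 stays excluded, as in free_time):

def is_idle_hour(hour: int) -> bool:
    # 19:00-23:59 or 01:00-08:59; midnight (hour 0) is not idle,
    # mirroring the tuple membership test in free_time above.
    return 19 <= hour <= 23 or 1 <= hour <= 8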
Code Example #2
async def async_post_get_vjkl5_url(client, guid, proxies={}, url=""):
    """
    获取vjkl5值
    :param client:
    :param guid:
    :param proxies:
    :param url:
    :return:
    """
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Host': 'wenshu.court.gov.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ProxyPool.get_random_header(),
    }
    payload = {
        "guid": "",
        "sorttype": 1,
        "number": "",
        "conditions": 'searchWord 2 AJLX  案件类型:民事案件',
    }
    writ_content = await client.post(url=url,
                                     proxy_headers=headers,
                                     data=payload,
                                     timeout=10,
                                     proxy=proxies.get("http"))
    assert writ_content.status == 200
    vjkl5 = writ_content.cookies.get("vjkl5")
    _ret = re.findall('vjkl5=(.*?);', str(vjkl5))[0]
    logging.info(_ret)
    if writ_content:
        writ_content.close()
    return _ret
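A minimal driver for the coroutine above, assuming client is an aiohttp.ClientSession; the list URL below is an assumption for illustration, not taken from the original code:

import asyncio
import aiohttp

async def main():
    async with aiohttp.ClientSession() as client:
        vjkl5 = await async_post_get_vjkl5_url(
            client,
            guid="",
            proxies={},                                  # empty dict -> proxy=None, direct connection
            url="http://wenshu.court.gov.cn/List/List",  # assumed list endpoint
        )
        print(vjkl5)

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())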
Code Example #3
import logging

logging.basicConfig(
    level=logging.INFO,
    format=
    '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filemode='a',
)
import datetime
import json
import execjs
import requests
# import tools
# from task_schema import db
from lawcase.js import wen_shu_js

import util.post_util as post_util
from util.post_util import get_random_header
from plan.plan_config import TABLE_NAME_SUFFIX
from proxy.pool import ProxyPool  # needed for ProxyPool() below; import path as in Code Example #5

pool = ProxyPool()
pool.change_ip_proxy_cache(1)


class SearchConditionBean(object):
    """
    Search condition.
    """
    __range_format = "{}:{} TO {}"  # time-range condition
    __str_format = "{}:{}"  # plain name:value condition

    def __init__(self, name, value: list):
        """
        Initialize a condition.
        :param name: condition name
        :param value: condition value
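Purely illustrative: how the two class-level format strings above might render search conditions. The 案件类型/民事案件 pair appears in the payload of Code Example #2; the date-range field name here is an assumption:

range_cond = "{}:{} TO {}".format("裁判日期", "2018-01-01", "2018-12-31")  # assumed field name
str_cond = "{}:{}".format("案件类型", "民事案件")
print(range_cond)  # 裁判日期:2018-01-01 TO 2018-12-31
print(str_cond)    # 案件类型:民事案件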
Code Example #4
async def async_get_data_javascript_step2(client, doc_id):
    payload = {
        "DocID": doc_id,
    }
    headers = {
        'Accept': 'text/javascript, application/javascript, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Proxy-Connection': 'keep-alive',
        "Referer":
        "http://wenshu.court.gov.cn/content/content?DocID={}&KeyWord=".format(
            doc_id),
        'Host': 'wenshu.court.gov.cn',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    ip_proxy_item = proxy_pool.extract_cache_ip_proxy_item()
    proxy = ip_proxy_item.proxies.get("http")
    logging.info("doc_id={};===proxy==={}".format(doc_id, proxy))
    writ_content = None
    try:
        ProxyPool.check_proxy(proxy)  # proxy health check
        # Request headers go in headers=; proxy_headers would only be sent on
        # the CONNECT tunnel, not with the actual request.
        writ_content = await client.post(
            url='http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID={}'
            .format(doc_id),
            headers=headers,
            data=payload,
            timeout=18,
            proxy=proxy)
        assert writ_content.status == 200
        java_script = await writ_content.text()
        # *** content validity check: begin ***
        if java_script and "window.location.href" in java_script:
            logging.info("---this IP may have been banned--- 【{}】".format(
                ip_proxy_item))
            proxy_pool.fail(ip_proxy_item, multiple=6)
            return
        elif java_script and "<title>出错啦</title>" in java_script:
            logging.info("---failed to fetch the content--- 【{}】".format(
                ip_proxy_item))
            proxy_pool.fail(ip_proxy_item, multiple=1)
            return
        # *** content validity check: end ***
        CaseDetailDao.update_case_detail(doc_id, java_script, ip_proxy_item)
        CaseDetailDao.remove_doc_id(doc_id)
        proxy_pool.success(ip_proxy_item)
    except AssertionError:
        logging.error("---AssertionError--- {} 【{}】".format(
            str(writ_content.status), ip_proxy_item))
        if writ_content.status == 503:
            proxy_pool.fail(ip_proxy_item)
        elif writ_content.status == 429:
            pass  # rate limited: transient, do not penalize the proxy
        elif writ_content.status == 502:
            pass  # bad gateway: transient, do not penalize the proxy
        else:
            logging.error("writ_content.status={} proxy={}".format(
                str(writ_content.status), proxy))
            proxy_pool.fail(ip_proxy_item)
    except NotIpProxyException:
        logging.error("=== no usable proxy IP ===")
        proxy_pool.refresh(ip_proxy_item)
    except ClientProxyConnectionError:
        logging.error(
            "---ClientProxyConnectionError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item, multiple=10)
    except ClientOSError:
        logging.error("---ClientOSError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item)
    except aiohttp.client_exceptions.ClientPayloadError:
        logging.error("---ClientPayloadError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item)
    except TimeoutError:
        logging.error("---TimeoutError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item)
    except concurrent_TimeoutError:
        logging.error("---concurrent_TimeoutError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item)
    except aiohttp.client_exceptions.ServerDisconnectedError:
        logging.error("---ServerDisconnectedError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item, multiple=2)
    except Exception:
        # Unknown failure: drop the task and re-validate the proxy entry.
        CaseDetailDao.remove_doc_id(doc_id)
        proxy_pool.refresh(ip_proxy_item)
        logging.exception("error=>:")
    finally:
        if writ_content:
            writ_content.close()
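The except chain above maps each failure type to a penalty weight for the proxy pool. As a purely illustrative sketch (assuming the exception classes and the proxy_pool.fail(item, multiple=n) semantics from the snippets above), the same policy can be written as data:

# Hypothetical refactoring sketch; not part of the original code.
PENALTY_MULTIPLES = [
    (ClientProxyConnectionError, 10),  # proxy unreachable: punish hard
    (aiohttp.client_exceptions.ServerDisconnectedError, 2),
    (ClientOSError, 1),
    (aiohttp.client_exceptions.ClientPayloadError, 1),
    (TimeoutError, 1),
    (concurrent_TimeoutError, 1),
]

def penalize(exc, ip_proxy_item):
    # Apply the first matching penalty weight; unknown errors re-validate
    # the proxy entry instead of punishing it, as the original code does.
    for exc_type, multiple in PENALTY_MULTIPLES:
        if isinstance(exc, exc_type):
            proxy_pool.fail(ip_proxy_item, multiple=multiple)
            return
    proxy_pool.refresh(ip_proxy_item)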
Code Example #5
import logging
from concurrent.futures._base import TimeoutError as concurrent_TimeoutError
import doc_formatter
from proxy.pool import ProxyPool
from dao.lawcase import CaseDetailDao
from aiohttp.client_exceptions import ClientOSError
from proxy.pool import NotIpProxyException

logging.basicConfig(
    level=logging.INFO,
    format=
    '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filemode='a',
)

proxy_pool = ProxyPool()


def download_doc(doc_id):
    try:
        text = save_data_javascript_file(doc_id, IpPort.proxies,
                                         config.save_data_javascript_file_path)
        if text.find("此篇文书不存在!") != -1:  # find() returns -1 when absent
            return "文档不存在"  # sentinel: "document does not exist"
        return proceed_data_javascript()
    except Exception as e:
        logging.error(e)


def download_doc_html(doc_id):
    format_page = None
Code Example #6
    int_hours = int(hours)
    ip_proxy_cache = pool.get_ip_proxy_cache()
    # Idle window: 19:00-23:59 plus 01:00-08:59 (hour 0 falls through to the
    # normal-hours branch).
    if int_hours in (19, 20, 21, 22, 23, 1, 2, 3, 4, 5, 6, 7, 8):
        logging.info("=*= within idle hours =*=")
        if ip_proxy_cache != IP_PROXY_FREE_CACHE:
            logging.info("=*= idle hours: changing the IP count =*=")
            pool.change_ip_proxy_cache(IP_PROXY_FREE_CACHE)
    else:
        logging.info("=*= within normal hours =*=")
        if ip_proxy_cache != IP_PROXY_CACHE:
            logging.info("=*= normal hours: changing the IP count =*=")
            pool.change_ip_proxy_cache(IP_PROXY_CACHE)


if __name__ == '__main__':
    pool = ProxyPool()
    task_pool = []
    while True:
        loop = asyncio.get_event_loop()
        task_pool = copy.deepcopy(CaseDetailDao.task_pool)
        free_time(pool)
        CaseDetailDao.init(pool.get_ip_proxy_cache() * IP_PEER_DOC_WEIGHT,
                           task_pool)
        if not CaseDetailDao.task_pool:
            logging.info("没有任务,休眠5秒")
            time.sleep(5)
            continue

        CaseDetailDao.task_pool = list(set(CaseDetailDao.task_pool))
        pool.validate_init_ip_proxy()
        loop.run_until_complete(
Code Example #7
async def proceed_schema(bean):
    """
    Download task.
    :return:
    """
    if not bean:
        logging.warning("no task")
        return False
    if not bean.page_index:  # covers both 0 and None
        bean.page_index = 1
    try:
        json_text = None
        batch_count = 0  # guard: the name was unbound when the parse branch below is skipped
        ip_proxy_item = None  # guard: extract_cache_ip_proxy_item() may raise before assignment
        try:
            ip_proxy_item = proxy_pool.extract_cache_ip_proxy_item()
            ProxyPool.check_proxy(ip_proxy_item)  # check the proxy
            proxies = ip_proxy_item.proxies
            json_text = await _proceed_schema(param=bean.schema_search,
                                              index=bean.page_index,
                                              page=bean.page,
                                              proxies=proxies)
        except AssertionError:
            proxy_pool.fail(ip_proxy_item, multiple=1)
            logging.error("=== AssertionError === {}".format(ip_proxy_item))
        except NotIpProxyException:
            logging.error("=== no usable proxy IP ===")
            if ip_proxy_item:
                proxy_pool.refresh(ip_proxy_item)
        except ClientProxyConnectionError:
            logging.error("=== ClientProxyConnectionError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item, multiple=10)
        except ClientOSError:
            logging.error("=== ClientOSError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item)
        except aiohttp.client_exceptions.ClientPayloadError:
            logging.error("=== ClientPayloadError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item)
        except TimeoutError:
            logging.error("=== TimeoutError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item)
        except concurrent.futures._base.TimeoutError:
            logging.error("=== concurrent_TimeoutError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item)
        except aiohttp.client_exceptions.ServerDisconnectedError:
            logging.error("=== ServerDisconnectedError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item, multiple=2)
        except Exception:
            proxy_pool.fail(ip_proxy_item, multiple=10)
            logging.exception("error=>: {}".format(ip_proxy_item))
        # check whether the content is valid
        success = False
        if not json_text:
            return
        elif "remind" in json_text:
            logging.info("[***---remind---***] param=" + bean.schema_search +
                         ";page=" + str(bean.page_index) + ";***")
            proxy_pool.fail(ip_proxy_item, multiple=5)
        elif "RunEval" in json_text:
            proxy_pool.success(ip_proxy_item)
            if "Count" in json_text:
                success = True
                logging.info("[***---success---***] param=" + bean.schema_search +
                             ";page=" + str(bean.page_index) + ";***")
            else:
                logging.info("[***---repeat---***] param=" + bean.schema_search +
                             ";page=" + str(bean.page_index) +
                             ";has a RunEval value but no Count value***")
                success = True
                bean.process = LawyerInfoBean.PROCESS_5
        # do not parse unless successful
        if not success:
            return
        if '"[{\\' in json_text and bean.process != LawyerInfoBean.PROCESS_5:
            json_text = json_text.replace('\\"', '"')[1:-1]  # unescape quotes, strip the outer quotes
            batch_count = int(json.loads(json_text)[0].get("Count"))
            bean.casenum = batch_count
            CaseLawyerContextDao.insert_case_lawyer_context(bean.lawyer_id, json_text,
                                                            bean.page_index, batch_count,
                                                            bean.page)
        # handle the outcome
        if bean.process == LawyerInfoBean.PROCESS_5:
            pass
        elif batch_count and bean.page_index * bean.page >= batch_count:
            bean.process = LawyerInfoBean.PROCESS_3  # all pages fetched
        elif bean.page_index * bean.page >= 200 and batch_count > 200:
            bean.process = LawyerInfoBean.PROCESS_4  # hit the 200-result listing cap
        else:
            bean.page_index = bean.page_index + 1
    except Exception:
        logging.exception("=== processing failed ===")
        bean.process = LawyerInfoBean.PROCESS_2  # processing failed
    finally:
        CaseLawyerDao.update(bean.lawyer_id, bean.page_index, bean.process, bean.casenum)
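A tiny worked example of the unescape-and-strip step above; the sample payload is made up, shaped like the RunEval/Count responses the code expects:

import json

raw = '"[{\\"Count\\":\\"123\\"}]"'            # escaped JSON-in-a-string
clean = raw.replace('\\"', '"')[1:-1]          # unescape quotes, strip the outer quotes
print(int(json.loads(clean)[0].get("Count")))  # -> 123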
Code Example #8

def remove_not_process_data(task_pool):
    logging.info("=*= pruning lawyers that need no crawling: begin =*=")
    for _index in range(len(task_pool) - 1, -1, -1):  # iterate in reverse so deletion is safe
        data = task_pool[_index]
        if data and data.process == LawyerInfoBean.PROCESS_1:
            pass  # still pending: keep it
        else:
            logging.info(data)
            del task_pool[_index]  # delete by index; remove(data) could hit a duplicate
    logging.info("=*= pruning lawyers that need no crawling: end =*=")


if __name__ == '__main__':
    pool = ProxyPool()
    pool.change_ip_proxy_cache(1)  # configure the proxy pool to hold a single IP
    task_pool = []
    while True:
        loop = asyncio.get_event_loop()
        pool.validate_init_ip_proxy()
        remove_not_process_data(task_pool)
        extract_num = LIST_CONTEXT_BATCH_NUM - len(task_pool)
        bean_list = RedisCaseLawyerTaskMasterHelper.extract_lawyer_info_bean_list(
            extract_num=extract_num)
        task_pool.extend(bean_list)
        if not task_pool:
            logging.info("=*=没有任务,休眠60秒=*=")
            time.sleep(60)
            continue
        loop.run_until_complete(
Code Example #9
    @staticmethod
    def callback_fail(doc_id):
        try:
            logging.info("[=*= FAIL =*=] {} ***{}***".format(
                "FAIL", "发生未识别的错误"))
            CaseLawyerDocDao.update_sync_status(
                doc_id=doc_id,
                state=CaseLawyerDocBean.SYNC_STATUS_09,
            )
        except Exception as e:
            logging.error(e)
        task_pool.remove(doc_id)


if __name__ == '__main__':
    pool = ProxyPool()
    pool.change_ip_proxy_cache(
        CaseLawyerDocConfig.IP_PROXY_CACHE_NUM__)  # size the proxy pool from config
    while True:
        loop = asyncio.get_event_loop()
        extract_num = CaseLawyerDocConfig.SPIDER_BATCH_NUM - len(task_pool)
        data_list = RedisCaseLawyerDocMaster.extract(extract_num=extract_num)
        task_pool.extend(data["doc_id"] for data in data_list)
        if not task_pool:
            logging.info("=*= 没有任务,休眠60秒 =*=")
            time.sleep(60)
            continue
        pool.validate_init_ip_proxy()
        loop.run_until_complete(
            asyncio.wait([
                download.async_get_data_javascript_callback(doc_id,