def free_time(pool: ProxyPool) -> None:
    """Resize the proxy-pool IP cache according to the current wall-clock hour.

    During off-peak hours the cache is set to ``IP_PROXY_FREE_CACHE``;
    otherwise to ``IP_PROXY_CACHE``. The pool is only changed when the
    current cache size differs from the target, so repeated calls are cheap.

    :param pool: the shared ``ProxyPool`` whose IP cache is adjusted
    """
    # Idiom fix: ``datetime.now().hour`` replaces the old
    # ``int(now.strftime('%H'))`` string round-trip (same value, no parsing).
    current_hour = datetime.datetime.now().hour
    ip_proxy_cache = pool.get_ip_proxy_cache()
    # NOTE(review): hour 0 (midnight) is absent from the "free" hours below,
    # so 00:00-00:59 is treated as normal time — confirm this is intentional.
    if current_hour in (19, 20, 21, 22, 23, 1, 2, 3, 4, 5, 6, 7, 8):
        logging.info("=*= 空闲时间内 =*=")
        if ip_proxy_cache != IP_PROXY_FREE_CACHE:
            logging.info("=*= 空闲时间内 改变IP数量 =*=")
            pool.change_ip_proxy_cache(IP_PROXY_FREE_CACHE)
    else:
        logging.info("=*= 正常时间内 =*=")
        if ip_proxy_cache != IP_PROXY_CACHE:
            logging.info("=*= 正常时间内 改变IP数量 =*=")
            pool.change_ip_proxy_cache(IP_PROXY_CACHE)
async def async_post_get_vjkl5_url(client, guid, proxies=None, url=""):
    """Fetch the ``vjkl5`` anti-crawl cookie via a list-page POST.

    :param client: aiohttp-style session providing an async ``post``
    :param guid: request guid (currently unused by this function)
    :param proxies: optional mapping; its ``"http"`` entry is used as proxy
    :param url: endpoint to POST to
    :return: the parsed ``vjkl5`` cookie value
    :raises AssertionError: when the response status is not 200
    :raises IndexError: when no ``vjkl5=`` value is found in the cookie
    """
    # Fix: the old signature used a mutable default (``proxies={}``) that is
    # shared across calls; use ``None`` and normalize locally instead.
    if proxies is None:
        proxies = {}
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Host': 'wenshu.court.gov.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ProxyPool.get_random_header(),
    }
    payload = {
        "guid": "",
        "sorttype": 1,
        "number": "",
        "conditions": 'searchWord 2 AJLX 案件类型:民事案件',
    }
    writ_content = await client.post(url=url,
                                     proxy_headers=headers,
                                     data=payload,
                                     timeout=10,
                                     proxy=proxies.get("http"))
    try:
        assert writ_content.status == 200
        vjkl5 = writ_content.cookies.get("vjkl5")
        _ret = re.findall('vjkl5=(.*?);', str(vjkl5))[0]
        logging.info(_ret)
        return _ret
    finally:
        # Fix: close the response even when the assert/parse above raises;
        # the old code only closed on the success path (response leak).
        if writ_content:
            writ_content.close()
datefmt='%a, %d %b %Y %H:%M:%S', filemode='a', ) import datetime import json import execjs import requests # import tools # from task_schema import db from lawcase.js import wen_shu_js import util.post_util as post_util from util.post_util import get_random_header from plan.plan_config import TABLE_NAME_SUFFIX pool = ProxyPool() pool.change_ip_proxy_cache(1) class SearchConditionBean(object): """ 搜索条件 """ __range_format = "{}:{} TO {}" # 时间区间 __str_format = "{}:{}" # def __init__(self, name, value: list): """ 条件初始化 :param name:条件名字 :param value:条件值
async def async_get_data_javascript_step2(client, doc_id):
    """Download the detail-page JavaScript for one document and persist it.

    Pulls a proxy item from the module-level ``proxy_pool``, POSTs to the
    CreateContentJS endpoint for *doc_id*, validates the body, and on success
    stores it via ``CaseDetailDao``. Every outcome feeds proxy health back to
    the pool (``success`` / ``fail`` / ``refresh``).

    :param client: aiohttp-style session providing an async ``post``
    :param doc_id: wenshu document id to download
    """
    payload = {
        "DocID": doc_id,
    }
    headers = {
        'Accept': 'text/javascript, application/javascript, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # NOTE(review): the embedded spaces look like a typo for
        # 'Proxy-Connection': 'keep-alive' — kept byte-identical here.
        'Proxy - Connection': 'keep - alive',
        "Referer": "http://wenshu.court.gov.cn/content/content?DocID={}&KeyWord=".format(
            doc_id),
        'Host': 'wenshu.court.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    ip_proxy_item = proxy_pool.extract_cache_ip_proxy_item()
    proxy = ip_proxy_item.proxies.get("http")
    logging.info(str("doc_id=" + doc_id) + ";===proxy===" + proxy)
    writ_content = None
    try:
        ProxyPool.check_proxy(proxy)  # proxy check (raises when unusable)
        writ_content = await client.post(
            url=
            'http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID={}'
            .format(doc_id),
            proxy_headers=headers,
            data=payload,
            timeout=18,
            proxy=proxy)
        java_script = await writ_content.text()
        # Non-200 status is routed to the AssertionError handler below.
        assert writ_content.status == 200
        # Content validity check *** start ***
        if java_script and "window.location.href" in java_script:
            # Redirect in the body: the proxy IP has probably been banned.
            logging.info("---ip已经可能被封--- 【{}】".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item, multiple=6)
            return
        elif java_script and "<title>出错啦</title>" in java_script:
            # Server-side error page: this fetch failed.
            logging.info("---获取内容失败--- 【{}】".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item, multiple=1)
            return
        # Content validity check *** end ***
        CaseDetailDao.update_case_detail(doc_id, java_script, ip_proxy_item)
        CaseDetailDao.remove_doc_id(doc_id)
        proxy_pool.success(ip_proxy_item)
    except AssertionError:
        # Only reachable after the post() returned, so writ_content is set.
        logging.error("---AssertionError--- {} 【{}】".format(
            str(writ_content.status), ip_proxy_item))
        if writ_content.status == 503:
            proxy_pool.fail(ip_proxy_item)
        elif writ_content.status == 429:
            pass  # 429/502: deliberately do not penalize the proxy
        elif writ_content.status == 502:
            pass
        else:
            logging.error("writ_content.status={} proxy={}".format(
                str(writ_content.status), proxy))
            proxy_pool.fail(ip_proxy_item)
    except NotIpProxyException:
        logging.error("===没有获取使用ip===")
        proxy_pool.refresh(ip_proxy_item)
    except ClientProxyConnectionError:
        logging.error(
            "---ClientProxyConnectionError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item, multiple=10)
    except ClientOSError:
        logging.error("---ClientOSError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item)
    except aiohttp.client_exceptions.ClientPayloadError:
        logging.error("---ClientPayloadError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item)
    except TimeoutError:
        logging.error("---TimeoutError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item)
    except concurrent_TimeoutError:
        logging.error("---concurrent_TimeoutError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item)
    except aiohttp.client_exceptions.ServerDisconnectedError:
        logging.error("---ServerDisconnectedError--- {}".format(ip_proxy_item))
        proxy_pool.fail(ip_proxy_item, multiple=2)
    except Exception:
        # Unclassified failure: release the doc id and recycle the proxy item.
        CaseDetailDao.remove_doc_id(doc_id)
        proxy_pool.refresh(ip_proxy_item)
        logging.exception("error=>:")
    finally:
        if writ_content:
            writ_content.close()
from concurrent.futures._base import TimeoutError as concurrent_TimeoutError

import doc_formatter
from proxy.pool import ProxyPool
from dao.lawcase import CaseDetailDao
from aiohttp.client_exceptions import ClientOSError
from proxy.pool import NotIpProxyException

logging.basicConfig(
    level=logging.INFO,
    format=
    '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filemode='a',
)

# Shared proxy pool for this module's download helpers.
proxy_pool = ProxyPool()


def download_doc(doc_id):
    """Download and process the detail JavaScript for one document.

    :param doc_id: wenshu document id
    :return: "文档不存在" when the page reports the document is missing,
        otherwise the result of ``proceed_data_javascript()``; ``None`` when
        an exception was caught and logged (best-effort contract preserved).
    """
    try:
        text = save_data_javascript_file(doc_id, IpPort.proxies,
                                         config.save_data_javascript_file_path)
        # Fix: the old check was ``text.find(...) > 0``, which misses the
        # marker when it appears at index 0; membership is the correct test.
        if "此篇文书不存在!" in text:
            return "文档不存在"
        return proceed_data_javascript()
    except Exception:
        # Fix: logging.error(e) dropped the traceback; keep best-effort
        # behavior (return None) but record the full stack for diagnosis.
        logging.exception("download_doc failed, doc_id=%s", doc_id)


def download_doc_html(doc_id):
    format_page = None
int_hours = int(hours) ip_proxy_cache = pool.get_ip_proxy_cache() if int_hours in (19, 20, 21, 22, 23, 1, 2, 3, 4, 5, 6, 7, 8): logging.info("=*= 空闲时间内 =*=") if ip_proxy_cache != IP_PROXY_FREE_CACHE: logging.info("=*= 空闲时间内 改变IP数量 =*=") pool.change_ip_proxy_cache(IP_PROXY_FREE_CACHE) else: logging.info("=*= 正常时间内 =*=") if ip_proxy_cache != IP_PROXY_CACHE: logging.info("=*= 正常时间内 改变IP数量 =*=") pool.change_ip_proxy_cache(IP_PROXY_CACHE) if __name__ == '__main__': pool = ProxyPool() task_pool = [] while True: loop = asyncio.get_event_loop() task_pool = copy.deepcopy(CaseDetailDao.task_pool) free_time(pool) CaseDetailDao.init(pool.get_ip_proxy_cache() * IP_PEER_DOC_WEIGHT, task_pool) if not CaseDetailDao.task_pool: logging.info("没有任务,休眠5秒") time.sleep(5) continue CaseDetailDao.task_pool = list(set(CaseDetailDao.task_pool)) pool.validate_init_ip_proxy() loop.run_until_complete(
async def proceed_schema(bean):
    """Run one list-page download for a lawyer search task.

    Fetches the page through a pooled proxy (recording proxy health on every
    outcome), validates the returned JSON, stores it through
    ``CaseLawyerContextDao``, and advances *bean*'s paging / process state.
    The final state is always persisted via ``CaseLawyerDao.update`` in
    ``finally``.

    :param bean: task bean exposing ``schema_search``, ``page``,
        ``page_index``, ``process``, ``casenum`` and ``lawyer_id``
    :return: ``False`` when *bean* is falsy, otherwise ``None``
    """
    if not bean:
        logging.warning("没有任务")
        return False
    if bean.page_index == 0 or bean.page_index is None:
        bean.page_index = 1  # paging is 1-based
    try:
        json_text = None
        try:
            ip_proxy_item = proxy_pool.extract_cache_ip_proxy_item()
            ProxyPool.check_proxy(ip_proxy_item)  # proxy check (raises when unusable)
            proxies = ip_proxy_item.proxies
            json_text = await _proceed_schema(param=bean.schema_search,
                                              index=bean.page_index,
                                              page=bean.page,
                                              proxies=proxies)
        except AssertionError:
            proxy_pool.fail(ip_proxy_item, multiple=1)
            logging.error("=== AssertionError === {}".format(ip_proxy_item))
        except NotIpProxyException:
            logging.error("=== 没有获取使用ip ===")
            proxy_pool.refresh(ip_proxy_item)
        except ClientProxyConnectionError:
            logging.error("=== ClientProxyConnectionError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item, multiple=10)
        except ClientOSError:
            logging.error("=== ClientOSError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item)
        except aiohttp.client_exceptions.ClientPayloadError:
            logging.error("=== ClientPayloadError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item)
        except TimeoutError:
            logging.error("=== TimeoutError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item)
        except concurrent.futures._base.TimeoutError:
            logging.error("=== concurrent_TimeoutError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item)
        except aiohttp.client_exceptions.ServerDisconnectedError:
            logging.error("=== ServerDisconnectedError {}".format(ip_proxy_item))
            proxy_pool.fail(ip_proxy_item, multiple=2)
        except Exception:
            proxy_pool.fail(ip_proxy_item, multiple=10)
            logging.exception("error=>: {}".format(ip_proxy_item))
        # Content validity check
        success = False
        if not json_text:
            return
        elif "remind" in json_text:
            # Anti-crawl "remind" response: penalize the proxy heavily.
            logging.info("[***---remind---***] param=" + bean.schema_search +
                         ";page=" + str(bean.page_index) + ";***")
            proxy_pool.fail(ip_proxy_item, multiple=5)
        elif "RunEval" in json_text:
            proxy_pool.success(ip_proxy_item)
            if "Count" in json_text:
                success = True
                logging.info("[***---success---***] param=" + bean.schema_search +
                             ";page=" + str(bean.page_index) + ";***")
            else:
                # RunEval present but no Count: treat as a "repeat" page.
                logging.info("[***--repeat---***] param=" + bean.schema_search +
                             ";page=" + str(bean.page_index) +
                             ";有RunEval值,但没有Count值***")
                success = True
                bean.process = LawyerInfoBean.PROCESS_5
        # Not successful: skip parsing.
        if not success:
            return
        if '"[{\\' in json_text and bean.process != LawyerInfoBean.PROCESS_5:
            json_text = json_text.replace('\\"', '\"')[1:-1]  # unescape characters
        # NOTE(review): indentation reconstructed — the lines below are assumed
        # to follow the unescape `if`, not to sit inside it; verify in history.
        batch_count = int(json.loads(json_text)[0].get("Count"))
        bean.casenum = batch_count
        CaseLawyerContextDao.insert_case_lawyer_context(bean.lawyer_id,
                                                        json_text,
                                                        bean.page_index,
                                                        batch_count, bean.page)
        # Success: advance process state / page index.
        if bean.process == LawyerInfoBean.PROCESS_5:
            pass
        elif (bean.page_index * bean.page >= batch_count):
            bean.process = LawyerInfoBean.PROCESS_3
        elif bean.page_index * bean.page >= 200 and batch_count > 200:
            bean.process = LawyerInfoBean.PROCESS_4
        else:
            bean.page_index = bean.page_index + 1
    except:
        logging.exception("===处理失败===")
        bean.process = LawyerInfoBean.PROCESS_2  # processing failed
    finally:
        CaseLawyerDao.update(bean.lawyer_id, bean.page_index, bean.process,
                             bean.casenum)
def remove_not_process_data(task_pool): logging.info("=*= 删除不需要爬取的律师队列 ===开始=*=") for _index in range(len(task_pool) - 1, -1, -1): # 倒序循环 data = task_pool[_index] if data and data.process == LawyerInfoBean.PROCESS_1: pass else: logging.info(data) task_pool.remove(data) logging.info("=*= 删除不需要爬取的律师队列 ===结束=*=") if __name__ == '__main__': pool = ProxyPool() pool.change_ip_proxy_cache(1) # 设置代理池一个ip task_pool = [] while True: loop = asyncio.get_event_loop() pool.validate_init_ip_proxy() remove_not_process_data(task_pool) extract_num = LIST_CONTEXT_BATCH_NUM - len(task_pool) bean_list = RedisCaseLawyerTaskMasterHelper.extract_lawyer_info_bean_list( extract_num=extract_num) task_pool.extend(bean_list) if not task_pool: logging.info("=*=没有任务,休眠60秒=*=") time.sleep(60) continue loop.run_until_complete(
    @staticmethod
    def callback_fail(doc_id):
        """Failure callback: mark *doc_id* as sync status 09 and drop it
        from the module-level ``task_pool``.

        DB errors are logged and swallowed so the crawl loop keeps running.

        :param doc_id: document id whose download failed
        """
        try:
            logging.info("[=*= FAIL =*=] {} ***{}***".format(
                "FAIL", "发生未识别的错误"))
            CaseLawyerDocDao.update_sync_status(
                doc_id=doc_id,
                state=CaseLawyerDocBean.SYNC_STATUS_09,
            )
        except Exception as e:
            logging.error(e)
        # NOTE(review): indentation reconstructed — assumed this removal runs
        # on every call (not only inside the except), otherwise a successful
        # status update would leave doc_id queued forever; confirm in history.
        task_pool.remove(doc_id)


if __name__ == '__main__':
    pool = ProxyPool()
    pool.change_ip_proxy_cache(
        CaseLawyerDocConfig.IP_PROXY_CACHE_NUM__)  # size the proxy-pool IP cache
    while True:
        loop = asyncio.get_event_loop()
        extract_num = CaseLawyerDocConfig.SPIDER_BATCH_NUM - len(task_pool)
        data_list = RedisCaseLawyerDocMaster.extract(extract_num=extract_num)
        task_pool.extend(data["doc_id"] for data in data_list)
        if not task_pool:
            logging.info("=*= 没有任务,休眠60秒 =*=")
            time.sleep(60)
            continue
        pool.validate_init_ip_proxy()
        loop.run_until_complete(
            asyncio.wait([
                download.async_get_data_javascript_callback(doc_id,