# -*- coding: utf-8 -*-
import random
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='newspaper', logger='nfrb').getlog()


def parse_url():
    articles = []  # renamed from `list`, which shadowed the builtin
    tempurl = formatUrl()
    # print(tempurl)
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        resp = requests.get(tempurl, headers=headers, timeout=10)
        resp.encoding = resp.apparent_encoding
        html = resp.text
        if resp.status_code == 200:
            soup = BeautifulSoup(html, 'html.parser')
            # 'btdh' is the edition-navigation block of the e-paper front page
            div = soup.find('div', attrs={'id': 'btdh'})
            for link in div.find_all('a'):
                path = link.get('href')
                title = link.get_text()
                realpath = parse.urljoin(tempurl, path)
                # short titles, or titles containing 版 ("page"), mark
                # navigation links rather than articles
                if len(title.strip()) <= 8 or '版' in title:
                    pass  # body truncated in the source excerpt
    except Exception as e:
        # assumed closer for the try above, matching this repo's pattern
        logger.info(e)
    return articles  # run.py iterates over parse_url()'s result
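# `formatUrl()` is called above but defined past the truncation point. A
# minimal sketch of what it presumably does, assuming the e-paper URL
# template (with strftime-style date placeholders) lives in the config file;
# the section and key names here are hypothetical:
def formatUrl():
    template = getconfig('nfrb', 'url')  # hypothetical config section/key
    return time.strftime(template, time.localtime())  # fill in today's date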
# -*- coding: utf-8 -*-
import importlib
import queue
import threading
import time

from service import *
from datetime import datetime, date
from util import configutil
from util import esutil
from util.LoggerClass import Logger

logger = Logger(logname='newspaper', logger='run').getlog()

# task list: names of the spider modules under service/ to run
options = []
q = queue.Queue()


def fetchUrl(q):
    while True:
        try:
            taskName = q.get_nowait()
            # dynamically import service/<taskName>.py
            name = importlib.import_module('.%s' % taskName, package='service')
        except Exception as e:
            # an empty queue raises queue.Empty, which also ends this worker
            logger.info(e)
            break
        # print('Current Thread Name %s, Url: %s ' % (threading.currentThread().name, taskName))
        try:
            result = name.parse_url()
            for kv in result:
                es_operate(kv)  # defined past the truncation point
            if len(result) != 0:  # was `result.__len__() != 0`
                pass  # body truncated in the source excerpt
        except Exception as e:
            logger.info(e)  # assumed closer, matching the try block above
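# The entry point falls outside this excerpt. A sketch of how the queue and
# the workers are presumably wired together, assuming `options` is filled
# with spider module names such as 'nfrb' or 'hainanrb'; the thread count
# is an arbitrary choice here:
if __name__ == '__main__':
    for task in options:
        q.put(task)
    threads = [threading.Thread(target=fetchUrl, args=(q,)) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()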
# -*- coding: utf-8 -*-
import random
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='newspaper', logger='hainanrb').getlog()


def parse_url():
    articles = []  # renamed from `list`, which shadowed the builtin
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            url = tempurl.format(i)  # the page number fills the {} placeholder
            # print(url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
                div = soup.find('div', attrs={'id': 'main-ed-articlenav-list'})
                # loop body continues past the truncation point
    except Exception as e:
        logger.info(e)  # assumed closer, matching this repo's pattern
    return articles
# -*- coding: utf-8 -*-
import random
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='newspaper', logger='cjrbwh').getlog()


def parse_url():
    articles = []  # renamed from `list`, which shadowed the builtin
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            url = tempurl.format(i)  # the page number fills the {} placeholder
            # print(url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
                # loop body continues past the truncation point
    except Exception as e:
        logger.info(e)  # assumed closer, matching this repo's pattern
    return articles
# -*- coding: utf-8 -*-
import random
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='newspaper', logger='hebnews').getlog()


def parse_url():
    articles = []  # renamed from `list`, which shadowed the builtin
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            # zero-padded page number; renamed from `str`, which shadowed the builtin
            page = "%02d" % i
            url = tempurl.format(page)
            # print(url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
                # loop body continues past the truncation point
    except Exception as e:
        logger.info(e)  # assumed closer, matching this repo's pattern
    return articles
import json
import random
import time
from urllib import parse

import execjs
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from util import esutil
from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='pjws', logger='pjws').getlog()

# pool of User-Agent strings to rotate between requests
user_agents = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    # ... the list continues past the truncation point
]
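# The pool above is presumably consumed by drawing one agent at random per
# request, so successive hits don't share a browser fingerprint. A minimal
# sketch of that pattern (the helper name is an assumption):
def random_headers():
    return {'User-Agent': random.choice(user_agents)}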
import execjs
import requests
from Cryptodome.Cipher import DES3
from Cryptodome.Util.Padding import unpad
from selenium import webdriver

from util import esutil
from util.LoggerClass import Logger
from util.configutil import getconfig

"""
September 2019 update of the wenshu.court.gov.cn spider. A quick look at the
site's new encryption shows it is quite a bit simpler than before: in short,
the `ciphertext` parameter is the only thing that changes while the rest
stays essentially constant, and the data returned from the POST needs a DES3
decryption. Nothing else seems difficult (though there may be pitfalls not
yet hit), so this is a rough logic script that still needs polishing.
"""

logger = Logger(logname='pjws', logger='pjws').getlog()


# ---------------------------- custom functions ----------------------------
def get_cookie():
    # drive a real browser and lift the SESSION cookie from it
    driver = webdriver.Chrome()
    driver.get('http://wenshu.court.gov.cn')
    cookie = driver.get_cookie('SESSION').get('value')
    print(cookie)
    return cookie


# fetch the ciphertext parameter (the function body lies past the truncation point)
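# The DES3 decryption mentioned in the module comment lies past the
# truncation point. A minimal sketch using the Cryptodome imports above,
# assuming base64-encoded ciphertext, CBC mode, and PKCS#7 padding; the
# key/iv handling is an assumption, not the site's confirmed scheme:
import base64


def des3_decrypt(ciphertext_b64, key, iv):
    raw = base64.b64decode(ciphertext_b64)
    cipher = DES3.new(key, DES3.MODE_CBC, iv)  # key must be 16 or 24 bytes
    return unpad(cipher.decrypt(raw), DES3.block_size).decode('utf-8')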
# -*- coding: utf-8 -*-
import random
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='newspaper', logger='xhrbjs').getlog()


def parse_url():
    articles = []  # renamed from `list`, which shadowed the builtin
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            url = tempurl.format(i)  # the page number fills the {} placeholder
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
                ul = soup.find(id='articlelist')
                # loop body continues past the truncation point
    except Exception as e:
        logger.info(e)  # assumed closer, matching this repo's pattern
    return articles
# -*- coding: utf-8 -*-
import hashlib
import time

from elasticsearch import Elasticsearch

from util import configutil
from util.LoggerClass import Logger

logger = Logger(logname='newspaper', logger='esutil').getlog()

try:
    host = configutil.getconfig('eshost', 'host')
    port = configutil.getconfig('eshost', 'port')
    es = Elasticsearch([{'host': host, 'port': port}])
except Exception as ex:
    logger.info(ex)


def insert_single_data(index_name, doc_type, data, esid):
    # index one document under an explicit id, so re-inserts overwrite
    try:
        res = es.index(index=index_name, doc_type=doc_type, body=data, id=esid)
        return res
    except Exception as e:
        logger.info(e)


def insert_datas(index_name, doc_type, datas):
    # bulk-index a batch of documents
    try:
        res = es.bulk(index=index_name, doc_type=doc_type, body=datas)
        return res
    except Exception as e:
        logger.info(e)  # assumed closer, matching insert_single_data above
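# hashlib is imported above, which suggests document ids are hashes of the
# scraped content, so a re-crawled article overwrites its earlier copy
# instead of duplicating it. A sketch of that usage; the index name, doc
# type, and the choice of URL as the hashed field are all assumptions:
def save_article(article):
    esid = hashlib.md5(article['url'].encode('utf-8')).hexdigest()
    return insert_single_data('newspaper', 'article', article, esid)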
# -*- coding: UTF-8 -*-
import time
from urllib import parse

from apscheduler.schedulers.blocking import BlockingScheduler
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from util import configutil
from util import esutil
from util.LoggerClass import Logger

logger = Logger(logname='cookieParse', logger='cookieParse').getlog()
sched = BlockingScheduler()


def cookie_Parse(url):
    driver = None
    try:
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        # skip image loading for speed
        options.add_argument('blink-settings=imagesEnabled=false')
        driver = webdriver.Chrome(chrome_options=options)
        driver.get(url)
        time.sleep(5)  # crude wait for JavaScript-rendered content
        return driver.page_source
    except Exception as e:
        logger.info('{} raised an exception while being parsed by cookie_Parse\n{}'.format(url, e))
        return 'exception raised during parsing'
    finally:
        # the finally body is truncated in the source; presumably the
        # browser is closed here
        if driver is not None:
            driver.quit()
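# `sched` is created above, but the job registration lies past the
# truncation point; presumably pages are re-fetched on a schedule. A sketch
# of the typical BlockingScheduler wiring; the interval, the config key,
# and the job body are all assumptions:
@sched.scheduled_job('interval', hours=1)
def refresh_job():
    html = cookie_Parse(configutil.getconfig('cookieparse', 'url'))  # hypothetical key
    logger.info('cookie_Parse returned {} characters'.format(len(html)))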
# -*- coding: utf-8 -*-
import random
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='newspaper', logger='guizhourb').getlog()


def parse_url():
    articles = []  # renamed from `list`, which shadowed the builtin
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            # zero-padded page number; renamed from `str`, which shadowed the builtin
            page = "%02d" % i
            url = tempurl.format(page)
            # print(url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
                div = soup.find('div', attrs={'class': 'newslist'})
                for link in div.find_all('a'):
                    path = link.get('href')
                    # loop body continues past the truncation point
    except Exception as e:
        logger.info(e)  # assumed closer, matching this repo's pattern
    return articles
# -*- coding: utf-8 -*-
import random
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='newspaper', logger='jfrbsh').getlog()


def parse_url():
    articles = []  # renamed from `list`, which shadowed the builtin
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            # zero-padded page number; renamed from `str`, which shadowed the builtin
            page = "%02d" % i
            url = tempurl.format(page)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            # this spider checks the body rather than the status code
            if html != '':
                pass  # body truncated in the source excerpt
    except Exception as e:
        logger.info(e)  # assumed closer, matching this repo's pattern
    return articles
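# Every paginated spider in this repo loops on `flag`, but the code that
# clears it lies past each excerpt's truncation point. A sketch of the usual
# termination rule (an assumption, not confirmed by the source): stop once
# the e-paper stops serving the next page number.
def has_more_pages(resp):
    return resp.status_code == 200  # a non-200 would end the while-flag loop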