Code Example #1
File: nfrb.py  Project: mmhan2008/newspaper
# -*- coding: utf-8 -*-
import random
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='newspaper', logger='nfrb').getlog()

def parse_url():
    list = []
    tempurl = formatUrl()
    # print(tempurl)
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        resp = requests.get(tempurl, headers=headers, timeout=10)
        resp.encoding = resp.apparent_encoding
        html = resp.text
        if resp.status_code == 200:
            soup = BeautifulSoup(html, 'html.parser')
            div = soup.find('div', attrs={'id': 'btdh'})
            for link in div.find_all('a'):
                path = link.get('href')
                title = link.get_text()
                realpath = parse.urljoin(tempurl, path)
                if len(title.strip()) <= 8 or '版' in title:
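                    # (Truncated here by the listing.) A hedged guess at the continuation
                    # (assumption, not the author's code): edition/section links ('版') and
                    # very short titles are skipped, everything else is collected, e.g.
                    #     continue
                    # list.append({'title': title, 'url': realpath})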
Code Example #2
File: run.py  Project: mmhan2008/newspaper
# -*- coding: utf-8 -*-
import importlib
import queue
import threading
import time

from service import *
from datetime import datetime,date
from util import configutil
from util import esutil
from util.LoggerClass import Logger

logger = Logger(logname='newspaper', logger='run').getlog()
# Task list
options = []
q = queue.Queue()

def fetchUrl(q):
    while True:
        try:
            taskName = q.get_nowait()
            name = importlib.import_module('.%s' % taskName, package='service')
        except Exception as e:
            logger.info(e)
            break
        # print('Current Thread Name %s, Url: %s ' % (threading.currentThread().name,taskName))
        try:
            result = name.parse_url()
            for kv in result:
                es_operate(kv)
            if len(result) != 0:
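                # (Truncated here by the listing.) A hedged sketch of the remainder
                # (assumptions, not the author's code): log the successful task and mark it
                # done, and at module level fill the queue and start one worker thread per
                # slot, e.g.
                #     for task in options:
                #         q.put(task)
                #     threads = [threading.Thread(target=fetchUrl, args=(q,)) for _ in range(4)]
                #     for t in threads:
                #         t.start()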
Code Example #3
# -*- coding: utf-8 -*-
import time
import requests
from util.configutil import getconfig
from bs4 import BeautifulSoup
from urllib import parse
from util.LoggerClass import Logger
import random
logger = Logger(logname='newspaper', logger='hainanrb').getlog()


def parse_url():
    list = []
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            url = tempurl.format(i)
            # print(url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
                div = soup.find('div', attrs={'id': 'main-ed-articlenav-list'})
Code Example #4
File: cjrbwh.py  Project: mmhan2008/newspaper
# -*- coding: utf-8 -*-
import time
import random
import requests
from util.configutil import getconfig
from bs4 import BeautifulSoup
from urllib import parse
from util.LoggerClass import Logger

logger = Logger(logname='newspaper', logger='cjrbwh').getlog()


def parse_url():
    list = []
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            url = tempurl.format(i)
            # print(url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
Code Example #5
File: hebnews.py  Project: mmhan2008/newspaper
# -*- coding: utf-8 -*-
import time
import requests
from util.configutil import getconfig
from bs4 import BeautifulSoup
from urllib import parse
from util.LoggerClass import Logger
import random
logger = Logger(logname='newspaper', logger='hebnews').getlog()


def parse_url():
    list = []
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            str = "%02d" % i
            url = tempurl.format(str)
            # print(url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
Code Example #6
import json
import random
import time
from urllib import parse

import execjs
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from util import esutil
from util.LoggerClass import Logger
from util.configutil import getconfig

logger = Logger(logname='pjws', logger='pjws').getlog()

user_agents = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
Code Example #7
import execjs
import requests
from Cryptodome.Cipher import DES3
from Cryptodome.Util.Padding import unpad
from selenium import webdriver

from util import esutil
from util.LoggerClass import Logger
from util.configutil import getconfig
"""
September 2019 update of the wenshu.court.gov.cn spider. A quick look at the site's updated
encryption shows it is much simpler than before: in short, the ciphertext parameter is the part
that changes while everything else stays basically the same. After posting the data and getting
a response there is a DES3 decryption step; nothing else seems especially difficult (possibly
there are pitfalls not yet hit). This is a rough logic script that still needs polishing.
"""
logger = Logger(logname='pjws', logger='pjws').getlog()

# ---------------------------------- Custom functions ----------------------------------


def get_cookie():
    driver = webdriver.Chrome()
    driver.get('http://wenshu.court.gov.cn')
    cookie = driver.get_cookie('SESSION').get('value')
    print(cookie)
    return cookie


# Get the ciphertext parameter
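# (The listing truncates this example here.)
#
# Hedged sketch (not part of the original file) of the DES3 decryption step described in the
# docstring above; the key, iv and payload layout are assumptions, not the site's actual scheme.
def decrypt_response(cipher_bytes, key, iv):
    # Decrypt a DES3/CBC payload and strip the PKCS#7 padding.
    des = DES3.new(key, DES3.MODE_CBC, iv=iv)
    return unpad(des.decrypt(cipher_bytes), DES3.block_size)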

Code Example #8
File: xhrbjs.py  Project: mmhan2008/newspaper
# -*- coding: utf-8 -*-
import time
import requests
import random
from util.configutil import getconfig
from bs4 import BeautifulSoup
from urllib import parse
from util.LoggerClass import Logger

logger = Logger(logname='newspaper', logger='xhrbjs').getlog()


def parse_url():
    list = []
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            url = tempurl.format(i)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
                ul = soup.find(id='articlelist')
Code Example #9
File: esutil.py  Project: mmhan2008/newspaper
# -*- coding: utf-8 -*-

import hashlib
import time

from elasticsearch import Elasticsearch

from util import configutil
from util.LoggerClass import Logger

logger = Logger(logname='newspaper', logger='esutil').getlog()
try:
    host = configutil.getconfig('eshost', 'host')
    port = configutil.getconfig('eshost', 'port')
    es = Elasticsearch([{'host': host, 'port': port}])
except Exception as ex:
    logger.info(ex)


def insert_single_data(index_name, doc_type, data, esid):
    try:
        res = es.index(index=index_name, doc_type=doc_type, body=data, id=esid)
        return res
    except Exception as e:
        logger.info(e)


def insert_datas(index_name, doc_type, datas):
    try:
        res = es.bulk(index=index_name, doc_type=doc_type, body=datas)
        return res
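        # (Truncated here by the listing; presumably the try block closes by logging the
        # exception, as insert_single_data does above.)


# Hedged usage sketch (assumption, not code from the original file): hashlib is imported
# above, presumably to derive a stable document id from the article URL before indexing:
#     esid = hashlib.md5(article_url.encode('utf-8')).hexdigest()
#     insert_single_data(index_name, doc_type, article, esid)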
Code Example #10
File: cookieParse.py  Project: mmhan2008/newspaper
# -*- coding: UTF-8 -*-
import time

from apscheduler.schedulers.blocking import BlockingScheduler
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from util import configutil
from util import esutil
from urllib import parse
from util.LoggerClass import Logger

logger = Logger(logname='cookieParse', logger='cookieParse').getlog()
sched = BlockingScheduler()


def cookie_Parse(url):
    try:
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('blink-settings=imagesEnabled=false')
        driver = webdriver.Chrome(chrome_options=options)
        driver.get(url)
        time.sleep(5)
        return driver.page_source
    except Exception as e:
        logger.info('{}: exception while parsing with cookie_Parse\n{}'.format(url, e))
        return 'Exception occurred while parsing'
    finally:
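        # (Truncated here by the listing; the finally block presumably releases the browser,
        # e.g. driver.quit().)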
Code Example #11
File: guizhourb.py  Project: mmhan2008/newspaper
# -*- coding: utf-8 -*-
import time
import requests
from util.configutil import getconfig
from bs4 import BeautifulSoup
from urllib import parse
from util.LoggerClass import Logger
import random
logger = Logger(logname='newspaper', logger='guizhourb').getlog()

def parse_url():
    list = []
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            str = "%02d" % i
            url = tempurl.format(str)
            # print(url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if resp.status_code == 200:
                soup = BeautifulSoup(html, 'html.parser')
                div = soup.find('div', attrs={'class': 'newslist'})
                for link in div.find_all('a'):
                    path = link.get('href')
Code Example #12
# -*- coding: utf-8 -*-
import time
import random
import requests
from util.configutil import getconfig
from bs4 import BeautifulSoup
from urllib import parse
from util.LoggerClass import Logger

logger = Logger(logname='newspaper', logger='jfrbsh').getlog()


def parse_url():
    list = []
    tempurl = formatUrl()
    flag = True
    i = 0
    try:
        while flag:
            i += 1
            str = "%02d" % i
            url = tempurl.format(str)
            # print(url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
            }
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            if html != '':
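                # (Truncated here by the listing.) A hedged sketch of how these paginated
                # scrapers typically terminate (assumption, not the author's code): parse the
                # article links on this edition page, and clear the loop flag once a page
                # comes back empty or without the expected list container, e.g.
                #     soup = BeautifulSoup(html, 'html.parser')
                #     div = soup.find('div', attrs={'class': 'newslist'})
                #     if div is None:
                #         flag = False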