Beispiel #1
0
from elasticsearch import Elasticsearch, exceptions
from elasticsearch.helpers import bulk
import re
from requests_html import HTML
from pymongo import MongoClient
from init import config_init, logger_init
from xlrd import open_workbook, xldate_as_tuple
import datetime
from bs4 import BeautifulSoup as bs
from bson import ObjectId

# Script-wide configuration and logger.
config = config_init()
logger = logger_init('迁移数据至 ES')

# Elasticsearch client for the host configured under [Aliyun_ES].
es = Elasticsearch([config['Aliyun_ES']['host']], timeout=30)

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]
Beispiel #2
0
from pymongo import MongoClient
import re
from bs4 import BeautifulSoup
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('四川省财政厅-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the finance_data collection; index it on url.
db.finance_data.create_index([('url', 1)])


def scczt_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [{
        'url':
Beispiel #3
0
from pymongo import MongoClient
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('江西省财政厅-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the finance_data collection; index it on url.
db.finance_data.create_index([('url', 1)])


def jxczt_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [
        {
            'url': 'http://www.jxf.gov.cn/govDepAction_cate_002_subCate_002004_page.nowPage_1.shtml',
            'request_url': 'http://www.jxf.gov.cn/govDepAction_cate_002_subCate_002004_page.nowPage_',
Beispiel #4
0
import pdfplumber
# imagemagick -> https://github.com/dahlia/wand/issues/327
from PIL import Image, ImageFile
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests

from init import logger_init, config_init

# from cmappings import apostrophe_map
# from config import disabled_keys
# from ocr_pdf import baidu_image_ocr
# from oss_utils import initAliOSS, ossAddFile, ossGetFile, saveLocalFile, alioss_base_url

# Module-level logger, created by the project's logger_init helper under the
# name 'pdf2html'.
log = logger_init('pdf2html')
# Name of the temporary directory (presumably for intermediate files).
temp_dir = 'temp_pdf2html'

# Behaviour switches. NOTE(review): the original remark "whether analyze
# images in pdf" presumably describes img_flag - confirm.
rm_flag = True
img_flag = False
debug_mode = False

# Upper and lower character-size bounds.
CHAR_SIZE_UPPER = 30
CHAR_SIZE_LOWER = 1

# Exactly the table-of-contents heading "目录".
tocHeadRE = re.compile('^目录$')
# Any text followed by leader characters (. … ·) and a run of digits.
tocRE = re.compile(r"(.*?[.…·]+[0-9]+)")
# Captures the first run of digits in a string.
numRE = re.compile(r".*?([0-9]+)")
# Captures the text that precedes the leader characters and trailing digits.
tocTextRE = re.compile(r"(.*?)[.…·]+[0-9]+$")
# Same as tocTextRE but explicitly anchored at the start of the string.
tocLineRE = re.compile(r"^(.*?)[.…·]+[0-9]+$")
# At least two leader characters followed by roman-numeral letters, e.g. "iv".
tocRomanRE = re.compile(r"[.…·]{2,}[lxvi]+", re.IGNORECASE)
# Splits a string into a prefix and the first run of roman-numeral letters.
numRomanRE = re.compile(r"(.*?)([lxvi]+)", re.IGNORECASE)
# Entries starting with an arabic number and a separator (. 、 ·).
arabicTocRE = re.compile(r"^[0-9]+[.、·]+")
# Page-number strings such as 1-1-1, 11, -1-, iv or 1-1-XVI.
# NOTE(review): "[--]" lists the same hyphen twice; a second dash character
# may have been lost from the pattern - confirm.
pageNumRE = re.compile(
    r"^\d+(?:[--]+\d+)*$|^[--]+\d+[--]+$|^[--]*[lxvi]+[--]*$|^\d+(?:-\d+)*[--]+[lxvi]+$",
    re.IGNORECASE,
)
Beispiel #5
0
import xlwt
from pymongo import MongoClient
from datetime import datetime
from init import config_init, logger_init

# Script-wide logger and configuration.
logger = logger_init('导出数据到xlsx')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]


def get_xlsx(sheet_name):
    workbook = xlwt.Workbook(encoding='ascii')

    worksheet = workbook.add_sheet(sheet_name)
    worksheet.write(0, 0, label='发文名称')
    worksheet.write(0, 1, label='文号')
    worksheet.write(0, 2, label='处罚日期')
Beispiel #6
0
import re

from pymongo import MongoClient
from utility import format_date, request_site_page
from bs4 import BeautifulSoup as bs
from init import logger_init, config_init
from oss_utils import init_ali_oss, oss_add_file

# Script-wide logger and configuration.
logger = logger_init('中国银行间市场交易商协会 数据解析')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Bucket object returned by oss_utils.init_ali_oss().
ali_bucket = init_ali_oss()


def nafmii_parse():
    for each_nafmii_document in db.nafmii_data.find(
        {'status': {
            '$nin': ['ignored']
Beispiel #7
0
from pymongo import MongoClient
from init import logger_init, config_init
import re
import xlwt

# Script-wide logger and configuration.
logger = logger_init('处罚决定 法律法规导出')
config = config_init()

# This script always connects with the mongodb_* settings; empty credentials
# are passed to MongoClient as None.
_mongo_cfg = config['mongodb']
db = MongoClient(
    host=_mongo_cfg['mongodb_host'],
    port=int(_mongo_cfg['mongodb_port']),
    username=(_mongo_cfg['mongodb_username']
              if _mongo_cfg['mongodb_username'] != '' else None),
    password=(_mongo_cfg['mongodb_password']
              if _mongo_cfg['mongodb_password'] != '' else None),
)[_mongo_cfg['mongodb_db_name']]

# Output workbook with one sheet and a single header cell at row 0, column 0.
workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('保监|银监|人行')
worksheet.write(0, 0, label='法律法规')

# Collect every distinct law citation found in the punishment texts, keeping
# first-seen order in all_law_list.
all_law_list = []
# Companion set so the duplicate check is O(1) instead of scanning the list.
_seen_laws = set()
# Filter that was previously considered for find():
# {'announcementOrg': {'$regex': '.*?(保监|银监|人行).*?'}}
for each_announcement in db.punishAnnouncement.find():
    punishment_decision = each_announcement[
        'punishmentBasement'] + '\n' + each_announcement['punishmentDecision']
    # Stripping '\r' and '\n' individually already removes every '\r\n' pair,
    # so no separate '\r\n' replacement is needed.
    punishment_decision = punishment_decision.replace('\r', '').replace(
        '\n', '')
    # NOTE(review): with three capture groups re.findall yields 3-tuples; the
    # inner parentheses may originally have been literal full-width brackets
    # - confirm against the upstream source.
    law_list = re.findall('(《.*?》((.*?))?)', punishment_decision)
    for each_law in law_list:
        if each_law not in _seen_laws:
            _seen_laws.add(each_law)
            all_law_list.append(each_law)
Beispiel #8
0
import re
import jsbeautifier
import js2py
import time

from pymongo import MongoClient
from init import logger_init, config_init
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

# Script-wide logger and configuration.
logger = logger_init('地方人民银行-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the pbc_data collection; index it on url.
db.pbc_data.create_index([('url', 1)])
Beispiel #9
0
from pymongo import MongoClient
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('黑龙江省律师协会-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the lawyers_data collection; index it on url.
db.lawyers_data.create_index([('url', 1)])


def hljls_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
Beispiel #10
0
from pymongo import MongoClient
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('河北省财政局-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the finance_data collection; index it on url.
db.finance_data.create_index([('url', 1)])


def hebcz_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [{
Beispiel #11
0
from pymongo import MongoClient
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('内蒙古自治区财政厅-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the finance_data collection; index it on url.
db.finance_data.create_index([('url', 1)])


def nmgczt_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [
        {
            'url': 'http://www.nmgp.gov.cn/category/bgt', 'origin': '内蒙古自治区财政厅'
        }
Beispiel #12
0
from pymongo import MongoClient
import re
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('福建省财政厅-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the finance_data collection; index it on url.
db.finance_data.create_index([('url', 1)])


def fjczt_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
Beispiel #13
0
import re

from pymongo import MongoClient
from init import logger_init, config_init
from oss_utils import init_ali_oss, oss_add_file
from bs4 import BeautifulSoup as bs
from utility import request_site_page, get_year, cn2dig

# Script-wide logger and configuration.
logger = logger_init('中国基金业协会 数据解析')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Bucket object returned by oss_utils.init_ali_oss().
ali_bucket = init_ali_oss()


# 中国基金业协会 数据解析
def amac_parse():
    for each_amac_document in db.amac_data.find({'status': {'$nin': ['ignored']}}):
        announcement_url = each_amac_document['url']
        announcement_title = each_amac_document['title']
        announcement_type = each_amac_document['type']
Beispiel #14
0
import re

from pymongo import MongoClient
from init import logger_init, config_init
from utility import request_site_page, get_content_text, format_date
from bs4 import BeautifulSoup as bs
from oss_utils import init_ali_oss, oss_add_file

# Script-wide logger and configuration.
logger = logger_init('证监会 数据解析')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Bucket object returned by oss_utils.init_ali_oss().
ali_bucket = init_ali_oss()


# 证监会解析
def parse_csrc(url, doc_type, data_id, org):
    logger.info(doc_type)
    logger.info('url to parse ' + url)
    r = request_site_page(url)
    if r is None:
Beispiel #15
0
from pymongo import MongoClient
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('北京市财政局-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the finance_data collection; index it on url.
db.finance_data.create_index([('url', 1)])


def bjccgp_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [{
Beispiel #16
0
from pymongo import MongoClient
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('甘肃省财政厅-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the finance_data collection; index it on url.
db.finance_data.create_index([('url', 1)])


def gsczt_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [{
Beispiel #17
0
from pymongo import MongoClient
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('广西自治区律师协会-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the lawyers_data collection; index it on url.
db.lawyers_data.create_index([('url', 1)])


def gxlawyer_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [{
Beispiel #18
0
from pymongo import MongoClient
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('新疆维吾尔自治区财政厅-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the finance_data collection; index it on url.
db.finance_data.create_index([('url', 1)])


def xjczt_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [{
Beispiel #19
0
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
import math
import re
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('辽宁省律师协会-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the lawyers_data collection; index it on url.
db.lawyers_data.create_index([('url', 1)])


def lnlawyers_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
Beispiel #20
0
from pymongo import MongoClient
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utility import request_site_page
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('山东省财政厅-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the finance_data collection; index it on url.
db.finance_data.create_index([('url', 1)])


def sdczt_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [{
Beispiel #21
0
import re

from pymongo import MongoClient
from utility import request_site_page
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
from init import logger_init, config_init

# Script-wide logger and configuration.
logger = logger_init('交易商协会-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the nafmii_data collection; index it on url.
db.nafmii_data.create_index([('url', 1)])


# 中国银行间市场交易商协会
def nafmii_crawler():
Beispiel #22
0
from pymongo import MongoClient
from init import logger_init, config_init

logger = logger_init('上交所 数据抓取')
config = config_init()
if config['mongodb']['dev_mongo'] == '1':
    db = MongoClient(config['mongodb']['ali_mongodb_url'],
                     username=config['mongodb']['ali_mongodb_username'],
                     password=config['mongodb']['ali_mongodb_password'],
                     port=int(config['mongodb']['ali_mongodb_port']))[
                         config['mongodb']['ali_mongodb_name']]

    dev_db = MongoClient(config['mongodb']['ali_mongodb_url'],
                         username=config['mongodb']['ali_mongodb_username'],
                         password=config['mongodb']['ali_mongodb_password'],
                         port=int(config['mongodb']['ali_mongodb_port']))[
                             config['mongodb']['dev_mongodb_db_name']]

    touzhiwang_db = MongoClient(
        config['mongodb']['ali_mongodb_url'],
        username=config['mongodb']['ali_mongodb_username'],
        password=config['mongodb']['ali_mongodb_password'],
        port=int(config['mongodb']['ali_mongodb_port']))[
            config['mongodb']['szse_mongodb_db_name']]
else:
    db = MongoClient(host=config['mongodb']['mongodb_host'],
                     port=int(config['mongodb']['mongodb_port']),
                     username=None if config['mongodb']['mongodb_username']
                     == '' else config['mongodb']['mongodb_username'],
                     password=None if config['mongodb']['mongodb_password']
                     == '' else config['mongodb']['mongodb_password'])[
Beispiel #23
0
import re

from pymongo import MongoClient
from init import logger_init, config_init
from bs4 import BeautifulSoup as bs
from utility import request_site_page
from urllib.parse import urljoin

# Script-wide logger and configuration.
logger = logger_init('中国证监会-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the csrc_data collection; index it on url.
db.csrc_data.create_index([('url', 1)])


# 证监会
def csrc_crawler():
    # 行政处罚决定 + 市场禁入决定
    url_list = [
        {
Beispiel #24
0
import re
import jsbeautifier
import js2py

from pymongo import MongoClient
from init import logger_init, config_init
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

# Script-wide logger and configuration.
logger = logger_init('中国人民银行-数据抓取')
config = config_init()

# Choose the MongoDB connection settings according to the dev_mongo flag.
_mongo_cfg = config['mongodb']
if _mongo_cfg['dev_mongo'] == '1':
    # Flag is '1': connect with the ali_mongodb_* settings.
    db = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )[_mongo_cfg['ali_mongodb_name']]
else:
    # Otherwise use the mongodb_* settings; empty credentials become None.
    db = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )[_mongo_cfg['mongodb_db_name']]

# Scraped records go into the pbc_data collection; index it on url.
db.pbc_data.create_index([('url', 1)])

Beispiel #25
0
import re
from pymongo import MongoClient
from init import config_init, logger_init
from pyhanlp import *
from bson import ObjectId

# Script-wide configuration and logger, both obtained from the project's
# init helpers.
config = config_init()
logger = logger_init('解析当事人')


def demo_chinese_name_recognition(sentence):
    """Segment ``sentence`` with HanLP's name recognition switched on.

    A fresh HanLP segmenter is created with Chinese name, organization and
    Japanese name recognition enabled; the value returned is whatever that
    segmenter's ``seg`` call produces for ``sentence``.
    """
    segmenter = HanLP.newSegment()
    segmenter = segmenter.enableNameRecognize(True)
    segmenter = segmenter.enableOrganizationRecognize(True)
    segmenter = segmenter.enableJapaneseNameRecognize(True)
    return segmenter.seg(sentence)


if config['mongodb']['dev_mongo'] == '1':
    db = MongoClient(config['mongodb']['ali_mongodb_url'],
                     username=config['mongodb']['ali_mongodb_username'],
                     password=config['mongodb']['ali_mongodb_password'],
                     port=int(config['mongodb']['ali_mongodb_port']))[
                         config['mongodb']['ali_mongodb_name']]
else:
    db = MongoClient(host=config['mongodb']['mongodb_host'],
                     port=int(config['mongodb']['mongodb_port']),
                     username=None if config['mongodb']['mongodb_username']
                     == '' else config['mongodb']['mongodb_username'],
                     password=None if config['mongodb']['mongodb_password']
Beispiel #26
0
import re

from pymongo import MongoClient
from utility import request_site_page
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
from init import logger_init, config_init

logger = logger_init('中国基金业协会-数据抓取')
config = config_init()

# Open the MongoDB database this script works against. Unless dev_mongo is
# '1' the mongodb_* settings are used; otherwise the ali_mongodb_* ones.
if config['mongodb']['dev_mongo'] != '1':
    db = MongoClient(
        host=config['mongodb']['mongodb_host'],
        port=int(config['mongodb']['mongodb_port']),
        # An empty username/password in the config is passed on as None.
        username=(config['mongodb']['mongodb_username']
                  if config['mongodb']['mongodb_username'] != '' else None),
        password=(config['mongodb']['mongodb_password']
                  if config['mongodb']['mongodb_password'] != '' else None),
    )[config['mongodb']['mongodb_db_name']]
else:
    db = MongoClient(
        config['mongodb']['ali_mongodb_url'],
        username=config['mongodb']['ali_mongodb_username'],
        password=config['mongodb']['ali_mongodb_password'],
        port=int(config['mongodb']['ali_mongodb_port']),
    )[config['mongodb']['ali_mongodb_name']]

# Scraped data is stored in the amac_data collection, indexed by url.
db.amac_data.create_index([('url', 1)])


def amac_crawler():
    result_list = []
    # Industry self-regulation
    prefix_url = ['http://www.amac.org.cn/xxgs/jlcf/index']
    # Institutions refused registration
Beispiel #27
0
# MongoClient is used below to open the database, so it must be imported
# here; without this import the script fails with NameError on load.
from pymongo import MongoClient

from init import logger_init, config_init
from utility import request_site_page

# Yunnan Provincial Department of Environmental Protection
# Index pages of the three sections covered by this script:
# http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/wrygkzjcfjd/index.html
# http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/zjhjwfxwxqgzjd/index.html
# http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/jbzxcfqymd/index.html
# The same URLs with a '{}' placeholder after 'index'
# (presumably filled with a page suffix -- confirm against the crawler code).
url_format_list = [
    'http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/wrygkzjcfjd/index{}.html',
    'http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/zjhjwfxwxqgzjd/index{}.html',
    'http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/jbzxcfqymd/index{}.html',
]
gov_name = '云南省环境保护厅'
collection_name = 'environment_data'

logger = logger_init(gov_name)
config = config_init()

# Select the MongoDB database from the 'mongodb' config section:
# dev_mongo == '1' connects with the ali_mongodb_* settings, any other value
# connects with the mongodb_* settings.
if config['mongodb']['dev_mongo'] == '1':
    db = MongoClient(config['mongodb']['ali_mongodb_url'],
                     username=config['mongodb']['ali_mongodb_username'],
                     password=config['mongodb']['ali_mongodb_password'],
                     port=int(config['mongodb']['ali_mongodb_port']))[
                         config['mongodb']['ali_mongodb_name']]
else:
    # An empty username/password in the config is passed on as None.
    db = MongoClient(host=config['mongodb']['mongodb_host'],
                     port=int(config['mongodb']['mongodb_port']),
                     username=None if config['mongodb']['mongodb_username']
                     == '' else config['mongodb']['mongodb_username'],
                     password=None if config['mongodb']['mongodb_password']
                     == '' else config['mongodb']['mongodb_password'])[
                         config['mongodb']['mongodb_db_name']]
Beispiel #28
0
from pymongo import MongoClient
from init import logger_init, config_init

logger = logger_init('深交所 数据抓取')
config = config_init()
# dev_mongo == '1': open three database handles, all with the same
# ali_mongodb_* connection settings but different database names.
# NOTE(review): this excerpt ends in the middle of the else-branch call.
if config['mongodb']['dev_mongo'] == '1':
    # Database named by ali_mongodb_name.
    db = MongoClient(config['mongodb']['ali_mongodb_url'],
                     username=config['mongodb']['ali_mongodb_username'],
                     password=config['mongodb']['ali_mongodb_password'],
                     port=int(config['mongodb']['ali_mongodb_port']))[
                         config['mongodb']['ali_mongodb_name']]

    # Database named by dev_mongodb_db_name.
    dev_db = MongoClient(config['mongodb']['ali_mongodb_url'],
                         username=config['mongodb']['ali_mongodb_username'],
                         password=config['mongodb']['ali_mongodb_password'],
                         port=int(config['mongodb']['ali_mongodb_port']))[
                             config['mongodb']['dev_mongodb_db_name']]

    # Database named by szse_mongodb_db_name.
    touzhiwang_db = MongoClient(
        config['mongodb']['ali_mongodb_url'],
        username=config['mongodb']['ali_mongodb_username'],
        password=config['mongodb']['ali_mongodb_password'],
        port=int(config['mongodb']['ali_mongodb_port']))[
            config['mongodb']['szse_mongodb_db_name']]
else:
    # Any other dev_mongo value: connect with the mongodb_* settings; an
    # empty username/password in the config is passed on as None.
    db = MongoClient(host=config['mongodb']['mongodb_host'],
                     port=int(config['mongodb']['mongodb_port']),
                     username=None if config['mongodb']['mongodb_username']
                     == '' else config['mongodb']['mongodb_username'],
                     password=None if config['mongodb']['mongodb_password']
                     == '' else config['mongodb']['mongodb_password'])[
Beispiel #29
0
import re

from pymongo import MongoClient
from init import logger_init, config_init
from bs4 import BeautifulSoup as bs
from utility import request_site_page
from urllib.parse import urljoin

logger = logger_init('地方证监局-数据抓取')
config = config_init()

# Open the MongoDB database this script works against. Unless dev_mongo is
# '1' the mongodb_* settings are used; otherwise the ali_mongodb_* ones.
if config['mongodb']['dev_mongo'] != '1':
    db = MongoClient(
        host=config['mongodb']['mongodb_host'],
        port=int(config['mongodb']['mongodb_port']),
        # An empty username/password in the config is passed on as None.
        username=(config['mongodb']['mongodb_username']
                  if config['mongodb']['mongodb_username'] != '' else None),
        password=(config['mongodb']['mongodb_password']
                  if config['mongodb']['mongodb_password'] != '' else None),
    )[config['mongodb']['mongodb_db_name']]
else:
    db = MongoClient(
        config['mongodb']['ali_mongodb_url'],
        username=config['mongodb']['ali_mongodb_username'],
        password=config['mongodb']['ali_mongodb_password'],
        port=int(config['mongodb']['ali_mongodb_port']),
    )[config['mongodb']['ali_mongodb_name']]

# Scraped data is stored in the csrc_data collection, indexed by url.
db.csrc_data.create_index([('url', 1)])


def local_csrc_crawler():
    # Links to administrative-penalty decisions that already have their own dedicated pages
    xzcf_url_list = [
        {'url': 'http://www.csrc.gov.cn/pub/beijing/bjxyzl/bjxzcf/', 'area': '北京证监局', 'type': '行政处罚决定'},
        {'url': 'http://www.csrc.gov.cn/pub/beijing/bjxzcf/', 'area': '北京证监局', 'type': '行政处罚决定'},
Beispiel #30
0
from pymongo import MongoClient
import re
from utility import request_site_page
from bs4 import BeautifulSoup as bs
from init import logger_init, config_init
from urllib.parse import urljoin

logger = logger_init('注会协会-数据抓取')
config = config_init()

# Open the MongoDB database this script works against. Unless dev_mongo is
# '1' the mongodb_* settings are used; otherwise the ali_mongodb_* ones.
if config['mongodb']['dev_mongo'] != '1':
    db = MongoClient(
        host=config['mongodb']['mongodb_host'],
        port=int(config['mongodb']['mongodb_port']),
        # An empty username/password in the config is passed on as None.
        username=(config['mongodb']['mongodb_username']
                  if config['mongodb']['mongodb_username'] != '' else None),
        password=(config['mongodb']['mongodb_password']
                  if config['mongodb']['mongodb_password'] != '' else None),
    )[config['mongodb']['mongodb_db_name']]
else:
    db = MongoClient(
        config['mongodb']['ali_mongodb_url'],
        username=config['mongodb']['ali_mongodb_username'],
        password=config['mongodb']['ali_mongodb_password'],
        port=int(config['mongodb']['ali_mongodb_port']),
    )[config['mongodb']['ali_mongodb_name']]

# Scraped data is stored in the cicpa_data collection, indexed by url.
db.cicpa_data.create_index([('url', 1)])


def cicpa_crawler():
    result_list = []  # 用来保存最后存入数据库的数据
    prefix_url = [{