from elasticsearch import Elasticsearch, exceptions
from elasticsearch.helpers import bulk
import re
from requests_html import HTML
from pymongo import MongoClient
from init import config_init, logger_init
from xlrd import open_workbook, xldate_as_tuple
import datetime
from bs4 import BeautifulSoup as bs
from bson import ObjectId

# Module-level setup for the "migrate data to ES" script: load the project
# configuration, create the logger, and open the Elasticsearch and MongoDB
# handles that the rest of the module uses.
config = config_init()
logger = logger_init('迁移数据至 ES')

# Elasticsearch client pointed at the single host from the [Aliyun_ES]
# section, with a 30 second request timeout.
es = Elasticsearch([config['Aliyun_ES']['host']], timeout=30)

# Cache the [mongodb] section so every lookup below reads from one place.
_mongo_cfg = config['mongodb']

if _mongo_cfg['dev_mongo'] == '1':
    # dev_mongo == '1' selects the `ali_mongodb_*` connection settings.
    _mongo_client = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )
    db = _mongo_client[_mongo_cfg['ali_mongodb_name']]
else:
    # Any other value selects the `mongodb_*` settings; an empty username or
    # password in the config is passed to MongoClient as None.
    _mongo_client = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )
    db = _mongo_client[_mongo_cfg['mongodb_db_name']]
from pymongo import MongoClient import re from bs4 import BeautifulSoup from utility import request_site_page from init import logger_init, config_init logger = logger_init('四川省财政厅-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入finance_data这个collection db.finance_data.create_index([('url', 1)]) def scczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{ 'url':
from pymongo import MongoClient import re from bs4 import BeautifulSoup from urllib.parse import urljoin from utility import request_site_page from init import logger_init, config_init logger = logger_init('江西省财政厅-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[config['mongodb']['ali_mongodb_name']] else: db = MongoClient( host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入finance_data这个collection db.finance_data.create_index([('url', 1)]) def jxczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [ { 'url': 'http://www.jxf.gov.cn/govDepAction_cate_002_subCate_002004_page.nowPage_1.shtml', 'request_url': 'http://www.jxf.gov.cn/govDepAction_cate_002_subCate_002004_page.nowPage_',
import pdfplumber
# imagemagick -> https://github.com/dahlia/wand/issues/327
from PIL import Image, ImageFile
from bs4 import BeautifulSoup as bs
from bs4 import Comment
import requests
from init import logger_init, config_init
# Imports that are currently disabled:
# from cmappings import apostrophe_map
# from config import disabled_keys
# from ocr_pdf import baidu_image_ocr
# from oss_utils import initAliOSS, ossAddFile, ossGetFile, saveLocalFile, alioss_base_url

# NOTE(review): `re` is used below but is not imported in the visible part of
# this file - confirm it is imported earlier.

log = logger_init('pdf2html')

# Name of the working directory used by this module.
temp_dir = 'temp_pdf2html'

# Module switches. The original note on this line reads "whether analyze
# images in pdf"; it presumably refers to img_flag - TODO confirm.
rm_flag = True
img_flag = False
debug_mode = False

# Upper and lower bounds for a character size.
CHAR_SIZE_UPPER = 30
CHAR_SIZE_LOWER = 1

# A line that is exactly the table-of-contents heading "目录".
tocHeadRE = re.compile('^目录$')
# Text followed by a run of leader characters (. … ·) and then digits.
# Previous pattern: '(.*?\.*[0-9]+)'
tocRE = re.compile(r"(.*?[.…·]+[0-9]+)")
# Captures a run of digits that follows a lazily matched prefix.
numRE = re.compile(r".*?([0-9]+)")
# Captures the text that precedes the leader characters and trailing digits.
# Previous pattern: '(.*?).*[0-9]+$'
tocTextRE = re.compile(r"(.*?)[.…·]+[0-9]+$")
# Same as tocTextRE, but anchored to the start of the line as well.
tocLineRE = re.compile(r"^(.*?)[.…·]+[0-9]+$")
# Two or more leader characters followed by the letters l/x/v/i, e.g. "iv".
tocRomanRE = re.compile(r"[.…·]{2,}[lxvi]+", re.IGNORECASE)
# Captures a lazily matched prefix and a following run of l/x/v/i letters.
numRomanRE = re.compile(r"(.*?)([lxvi]+)", re.IGNORECASE)
# Leading digits followed by one or more of ".", "、" or "·".
arabicTocRE = re.compile(r"^[0-9]+[.、·]+")
# Page-number forms: 1-1-1 or 11 or -1- or iv or 1-1-XVI
pageNumRE = re.compile(
    r"^\d+(?:[--]+\d+)*$|^[--]+\d+[--]+$|^[--]*[lxvi]+[--]*$|^\d+(?:-\d+)*[--]+[lxvi]+$",
    re.I)
import xlwt from pymongo import MongoClient from datetime import datetime from init import config_init, logger_init logger = logger_init('导出数据到xlsx') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] def get_xlsx(sheet_name): workbook = xlwt.Workbook(encoding='ascii') worksheet = workbook.add_sheet(sheet_name) worksheet.write(0, 0, label='发文名称') worksheet.write(0, 1, label='文号') worksheet.write(0, 2, label='处罚日期')
import re from pymongo import MongoClient from utility import format_date, request_site_page from bs4 import BeautifulSoup as bs from init import logger_init, config_init from oss_utils import init_ali_oss, oss_add_file logger = logger_init('中国银行间市场交易商协会 数据解析') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] ali_bucket = init_ali_oss() def nafmii_parse(): for each_nafmii_document in db.nafmii_data.find( {'status': { '$nin': ['ignored']
from pymongo import MongoClient
from init import logger_init, config_init
import re
import xlwt

# Script that collects every distinct law/regulation citation found in the
# punishment announcements so it can be exported to a spreadsheet.
logger = logger_init('处罚决定 法律法规导出')
config = config_init()

# Connect with the `mongodb_*` settings; an empty username or password in the
# config is passed to MongoClient as None.
db = MongoClient(
    host=config['mongodb']['mongodb_host'],
    port=int(config['mongodb']['mongodb_port']),
    username=None if config['mongodb']['mongodb_username'] == ''
    else config['mongodb']['mongodb_username'],
    password=None if config['mongodb']['mongodb_password'] == ''
    else config['mongodb']['mongodb_password'])[
        config['mongodb']['mongodb_db_name']]

# One sheet with a single header cell in row 0, column 0.
workbook = xlwt.Workbook(encoding='ascii')
worksheet = workbook.add_sheet('保监|银监|人行')
worksheet.write(0, 0, label='法律法规')

# Distinct citations in first-seen order. The companion set gives O(1)
# membership checks instead of rescanning the list for every match.
all_law_list = []
_seen_laws = set()

# Commented-out query filter kept from the original (presumably an optional
# restriction on the announcing organisation - TODO confirm):
# {'announcementOrg': {'$regex': '.*?(保监|银监|人行).*?'}}
for each_announcement in db.punishAnnouncement.find():
    punishment_decision = (each_announcement['punishmentBasement'] + '\n' +
                           each_announcement['punishmentDecision'])
    # Strip every line break so a citation split across lines still matches.
    # Removing '\r' and '\n' individually already covers '\r\n', so the
    # original third replace('\r\n', '') was a no-op and is dropped.
    punishment_decision = punishment_decision.replace('\r', '').replace(
        '\n', '')
    law_list = re.findall('(《.*?》((.*?))?)', punishment_decision)
    for each_law in law_list:
        if each_law not in _seen_laws:
            _seen_laws.add(each_law)
            all_law_list.append(each_law)
import re
import jsbeautifier
import js2py
import time
from pymongo import MongoClient
from init import logger_init, config_init
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

# Module-level setup for the local People's Bank crawler: logger, project
# configuration and the MongoDB database handle.
logger = logger_init('地方人民银行-数据抓取')
config = config_init()

# Cache the [mongodb] section so every lookup below reads from one place.
_mongo_cfg = config['mongodb']

if _mongo_cfg['dev_mongo'] == '1':
    # dev_mongo == '1' selects the `ali_mongodb_*` connection settings.
    _mongo_client = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )
    db = _mongo_client[_mongo_cfg['ali_mongodb_name']]
else:
    # Any other value selects the `mongodb_*` settings; an empty username or
    # password in the config is passed to MongoClient as None.
    _mongo_client = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )
    db = _mongo_client[_mongo_cfg['mongodb_db_name']]

# Crawled data is stored in the pbc_data collection; make sure it has an
# index on the `url` field (direction 1).
db.pbc_data.create_index([('url', 1)])
from pymongo import MongoClient import re from urllib.parse import urljoin from bs4 import BeautifulSoup from utility import request_site_page from init import logger_init, config_init logger = logger_init('黑龙江省律师协会-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入lawyers_data这个collection db.lawyers_data.create_index([('url', 1)]) def hljls_crawler(): result_list = [] # 用来保存最后存入数据库的数据
from pymongo import MongoClient import re from bs4 import BeautifulSoup from urllib.parse import urljoin from utility import request_site_page from init import logger_init, config_init logger = logger_init('河北省财政局-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入finance_data这个collection db.finance_data.create_index([('url', 1)]) def hebcz_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{
from pymongo import MongoClient import re from bs4 import BeautifulSoup from urllib.parse import urljoin from utility import request_site_page from init import logger_init, config_init logger = logger_init('内蒙古自治区财政厅-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[config['mongodb']['ali_mongodb_name']] else: db = MongoClient( host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入finance_data这个collection db.finance_data.create_index([('url', 1)]) def nmgczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [ { 'url': 'http://www.nmgp.gov.cn/category/bgt', 'origin': '内蒙古自治区财政厅' }
from pymongo import MongoClient import re import json from bs4 import BeautifulSoup from urllib.parse import urljoin from utility import request_site_page from init import logger_init, config_init logger = logger_init('福建省财政厅-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入finance_data这个collection db.finance_data.create_index([('url', 1)]) def fjczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据
import re from pymongo import MongoClient from init import logger_init, config_init from oss_utils import init_ali_oss, oss_add_file from bs4 import BeautifulSoup as bs from utility import request_site_page, get_year, cn2dig logger = logger_init('中国基金业协会 数据解析') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[config['mongodb']['ali_mongodb_name']] else: db = MongoClient( host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] ali_bucket = init_ali_oss() # 中国基金业协会 数据解析 def amac_parse(): for each_amac_document in db.amac_data.find({'status': {'$nin': ['ignored']}}): announcement_url = each_amac_document['url'] announcement_title = each_amac_document['title'] announcement_type = each_amac_document['type']
import re from pymongo import MongoClient from init import logger_init, config_init from utility import request_site_page, get_content_text, format_date from bs4 import BeautifulSoup as bs from oss_utils import init_ali_oss, oss_add_file logger = logger_init('证监会 数据解析') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[config['mongodb']['ali_mongodb_name']] else: db = MongoClient( host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] ali_bucket = init_ali_oss() # 证监会解析 def parse_csrc(url, doc_type, data_id, org): logger.info(doc_type) logger.info('url to parse ' + url) r = request_site_page(url) if r is None:
from pymongo import MongoClient import re from bs4 import BeautifulSoup from urllib.parse import urljoin from utility import request_site_page from init import logger_init, config_init logger = logger_init('北京市财政局-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入finance_data这个collection db.finance_data.create_index([('url', 1)]) def bjccgp_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{
from pymongo import MongoClient import re from bs4 import BeautifulSoup from urllib.parse import urljoin from utility import request_site_page from init import logger_init, config_init logger = logger_init('甘肃省财政厅-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入finance_data这个collection db.finance_data.create_index([('url', 1)]) def gsczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{
from pymongo import MongoClient import re from urllib.parse import urljoin from bs4 import BeautifulSoup from utility import request_site_page from init import logger_init, config_init logger = logger_init('广西自治区律师协会-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入lawyers_data这个collection db.lawyers_data.create_index([('url', 1)]) def gxlawyer_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{
from pymongo import MongoClient import re from bs4 import BeautifulSoup from urllib.parse import urljoin from utility import request_site_page from init import logger_init, config_init logger = logger_init('新疆维吾尔自治区财政厅-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入finance_data这个collection db.finance_data.create_index([('url', 1)]) def xjczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{
from pymongo import MongoClient import requests from bs4 import BeautifulSoup as bs from urllib.parse import urljoin import math import re from init import logger_init, config_init logger = logger_init('辽宁省律师协会-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入lawyers_data这个collection db.lawyers_data.create_index([('url', 1)]) def lnlawyers_crawler(): result_list = [] # 用来保存最后存入数据库的数据
from pymongo import MongoClient import re from bs4 import BeautifulSoup from urllib.parse import urljoin from utility import request_site_page from init import logger_init, config_init logger = logger_init('山东省财政厅-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入finance_data这个collection db.finance_data.create_index([('url', 1)]) def sdczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{
import re from pymongo import MongoClient from utility import request_site_page from bs4 import BeautifulSoup as bs from urllib.parse import urljoin from init import logger_init, config_init logger = logger_init('交易商协会-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入nafmii_data这个collection db.nafmii_data.create_index([('url', 1)]) # 中国银行间市场交易商协会 def nafmii_crawler():
from pymongo import MongoClient from init import logger_init, config_init logger = logger_init('上交所 数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] dev_db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['dev_mongodb_db_name']] touzhiwang_db = MongoClient( config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['szse_mongodb_db_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[
import re from pymongo import MongoClient from init import logger_init, config_init from bs4 import BeautifulSoup as bs from utility import request_site_page from urllib.parse import urljoin logger = logger_init('中国证监会-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[config['mongodb']['ali_mongodb_name']] else: db = MongoClient( host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入csrc_data这个collection db.csrc_data.create_index([('url', 1)]) # 证监会 def csrc_crawler(): # 行政处罚决定 + 市场禁入决定 url_list = [ {
import re
import jsbeautifier
import js2py
from pymongo import MongoClient
from init import logger_init, config_init
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin

# Module-level setup for the People's Bank of China crawler: logger, project
# configuration and the MongoDB database handle.
logger = logger_init('中国人民银行-数据抓取')
config = config_init()

# Read the [mongodb] section once; both branches below take all of their
# connection parameters from it.
_mongo_section = config['mongodb']
_use_ali_settings = _mongo_section['dev_mongo'] == '1'

if _use_ali_settings:
    # The flag value '1' switches to the `ali_mongodb_*` keys.
    db = MongoClient(
        _mongo_section['ali_mongodb_url'],
        username=_mongo_section['ali_mongodb_username'],
        password=_mongo_section['ali_mongodb_password'],
        port=int(_mongo_section['ali_mongodb_port']),
    )[_mongo_section['ali_mongodb_name']]
else:
    # Otherwise the `mongodb_*` keys are used, with blank credentials
    # translated to None before they reach MongoClient.
    db = MongoClient(
        host=_mongo_section['mongodb_host'],
        port=int(_mongo_section['mongodb_port']),
        username=(_mongo_section['mongodb_username']
                  if _mongo_section['mongodb_username'] != '' else None),
        password=(_mongo_section['mongodb_password']
                  if _mongo_section['mongodb_password'] != '' else None),
    )[_mongo_section['mongodb_db_name']]

# Crawled data goes into the pbc_data collection, which is indexed on the
# `url` field (direction 1).
db.pbc_data.create_index([('url', 1)])
import re from pymongo import MongoClient from init import config_init, logger_init from pyhanlp import * from bson import ObjectId config = config_init() logger = logger_init('解析当事人') def demo_chinese_name_recognition(sentence): """ 中国人名识别 """ segment = HanLP.newSegment().enableNameRecognize( True).enableOrganizationRecognize(True).enableJapaneseNameRecognize( True) return segment.seg(sentence) if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password']
import re from pymongo import MongoClient from utility import request_site_page from bs4 import BeautifulSoup as bs from urllib.parse import urljoin from init import logger_init, config_init logger = logger_init('中国基金业协会-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[config['mongodb']['ali_mongodb_name']] else: db = MongoClient( host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入amac_data这个collection db.amac_data.create_index([('url', 1)]) def amac_crawler(): result_list = [] # 行业自律 prefix_url = ['http://www.amac.org.cn/xxgs/jlcf/index'] # 不予登记机构
from init import logger_init, config_init
from utility import request_site_page

# NOTE(review): MongoClient is used below but is not imported in the visible
# part of this file - confirm `from pymongo import MongoClient` exists above.

# Yunnan Provincial Department of Environmental Protection
# http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/wrygkzjcfjd/index.html
# http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/zjhjwfxwxqgzjd/index.html
# http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/jbzxcfqymd/index.html

# URL templates for the three listing sections named above; the `{}`
# placeholder sits between "index" and ".html".
url_format_list = [
    'http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/wrygkzjcfjd/index{}.html',
    'http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/zjhjwfxwxqgzjd/index{}.html',
    'http://www.ynepb.gov.cn/wryhjjgxxgk/wryxxgkxzcf/jbzxcfqymd/index{}.html',
]

# Name of the publishing authority (also used as the logger name) and the
# name of a MongoDB collection.
gov_name = '云南省环境保护厅'
collection_name = 'environment_data'

logger = logger_init(gov_name)
config = config_init()

# Cache the [mongodb] section so every lookup below reads from one place.
_mongo_cfg = config['mongodb']

if _mongo_cfg['dev_mongo'] == '1':
    # dev_mongo == '1' selects the `ali_mongodb_*` connection settings.
    _mongo_client = MongoClient(
        _mongo_cfg['ali_mongodb_url'],
        username=_mongo_cfg['ali_mongodb_username'],
        password=_mongo_cfg['ali_mongodb_password'],
        port=int(_mongo_cfg['ali_mongodb_port']),
    )
    db = _mongo_client[_mongo_cfg['ali_mongodb_name']]
else:
    # Any other value selects the `mongodb_*` settings; an empty username or
    # password in the config is passed to MongoClient as None.
    _mongo_client = MongoClient(
        host=_mongo_cfg['mongodb_host'],
        port=int(_mongo_cfg['mongodb_port']),
        username=(_mongo_cfg['mongodb_username']
                  if _mongo_cfg['mongodb_username'] != '' else None),
        password=(_mongo_cfg['mongodb_password']
                  if _mongo_cfg['mongodb_password'] != '' else None),
    )
    db = _mongo_client[_mongo_cfg['mongodb_db_name']]
from pymongo import MongoClient from init import logger_init, config_init logger = logger_init('深交所 数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] dev_db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['dev_mongodb_db_name']] touzhiwang_db = MongoClient( config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['szse_mongodb_db_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[
import re from pymongo import MongoClient from init import logger_init, config_init from bs4 import BeautifulSoup as bs from utility import request_site_page from urllib.parse import urljoin logger = logger_init('地方证监局-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[config['mongodb']['ali_mongodb_name']] else: db = MongoClient( host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入csrc_data这个collection db.csrc_data.create_index([('url', 1)]) def local_csrc_crawler(): # 已有单独页面的行政处罚决定链接 xzcf_url_list = [ {'url': 'http://www.csrc.gov.cn/pub/beijing/bjxyzl/bjxzcf/', 'area': '北京证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/beijing/bjxzcf/', 'area': '北京证监局', 'type': '行政处罚决定'},
from pymongo import MongoClient import re from utility import request_site_page from bs4 import BeautifulSoup as bs from init import logger_init, config_init from urllib.parse import urljoin logger = logger_init('注会协会-数据抓取') config = config_init() if config['mongodb']['dev_mongo'] == '1': db = MongoClient(config['mongodb']['ali_mongodb_url'], username=config['mongodb']['ali_mongodb_username'], password=config['mongodb']['ali_mongodb_password'], port=int(config['mongodb']['ali_mongodb_port']))[ config['mongodb']['ali_mongodb_name']] else: db = MongoClient(host=config['mongodb']['mongodb_host'], port=int(config['mongodb']['mongodb_port']), username=None if config['mongodb']['mongodb_username'] == '' else config['mongodb']['mongodb_username'], password=None if config['mongodb']['mongodb_password'] == '' else config['mongodb']['mongodb_password'])[ config['mongodb']['mongodb_db_name']] # 抓取数据存入cicpa_data这个collection db.cicpa_data.create_index([('url', 1)]) def cicpa_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{