def __init__(self, json_restore_path=None):
    """Set up the Hebei crawler: HTTP session, captcha recognizer and paths.

    :param json_restore_path: base directory for html dumps, the captcha
        image and json results.
    """
    # Headers shared by every request made through this session.
    base_headers = {
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        "User-Agent": get_user_agent(),
    }
    self.CR = CaptchaRecognition("hebei")
    session = requests.Session()
    session.headers.update(base_headers)
    # Large connection pool: many pages are fetched concurrently.
    pooled_adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
    session.mount('http://', pooled_adapter)
    self.requests = session
    self.ents = {}
    self.json_dict = {}
    self.json_restore_path = json_restore_path
    self.csrf = ""
    # Where the downloaded captcha image is written before recognition.
    self.path_captcha = self.json_restore_path + '/hebei/ckcode.jpeg'
    # Directory used to dump the raw html pages.
    self.html_restore_path = self.json_restore_path + '/hebei/'
    self.proxies = get_proxy('hebei')
    # (connect, read) timeouts in seconds.
    self.timeout = (30, 20)
def __init__(self): self.reqst = requests.Session() self.reqst.headers.update({ 'Connection': "keep-alive", 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 'Content-Type': 'application/x-www-form-urlencoded', 'User-Agent': get_user_agent() }) adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100) self.reqst.mount('http://', adapter) self.json_dict = {} print "In Crawler"
def __init__(self, json_restore_path=None):
    """Initialize the Guizhou crawler: session, storage paths, captcha
    recognizer, target URLs and the section-title -> json-key lookup maps.

    :param json_restore_path: base directory for html dumps, the captcha
        image and json results.
    """
    # Millisecond timestamp string; the site expects it as a request parameter.
    self.cur_time = str(int(time.time() * 1000))
    self.nbxh = None
    self.reqst = requests.Session()
    self.json_restore_path = json_restore_path
    # NOTE(review): these concatenations raise TypeError when
    # json_restore_path is left as None -- confirm callers always pass it.
    self.html_restore_path = self.json_restore_path + '/guizhou/'
    self.ckcode_image_path = self.json_restore_path + '/guizhou/ckcode.jpg'
    self.code_cracker = CaptchaRecognition('guizhou')
    # Fix: result_json_dict was initialized twice (once here and once at the
    # very end of the method); a single assignment suffices.
    self.result_json_dict = {}
    self.reqst.headers.update({
        'Connection': "keep-alive",
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': get_user_agent()
    })
    # Endpoints used by the crawler (search page, result list, captcha image).
    self.mydict = {
        'eareName': 'http://www.ahcredit.gov.cn',
        'search': 'http://gsxt.gzgs.gov.cn/',
        'searchList': 'http://gsxt.gzgs.gov.cn/search!searchSczt.shtml',
        'validateCode': 'http://gsxt.gzgs.gov.cn/search!generateCode.shtml?validTag=searchImageCode&'
    }
    # Section-title -> json-key maps for the four publicity tabs. Several
    # titles map to the same key on purpose: the site uses inconsistent
    # wording (and at least one typo, 出置 for 出质) across pages.
    self.one_dict = {
        u'基本信息': 'ind_comm_pub_reg_basic',
        u'股东信息': 'ind_comm_pub_reg_shareholder',
        u'发起人信息': 'ind_comm_pub_reg_shareholder',
        u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
        u'变更信息': 'ind_comm_pub_reg_modify',
        u'主要人员信息': 'ind_comm_pub_arch_key_persons',
        u'分支机构信息': 'ind_comm_pub_arch_branch',
        u'清算信息': 'ind_comm_pub_arch_liquidation',
        u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
        u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
        u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
        u'行政处罚信息': 'ind_comm_pub_administration_sanction',
        u'经营异常信息': 'ind_comm_pub_business_exception',
        u'严重违法信息': 'ind_comm_pub_serious_violate_law',
        u'抽查检查信息': 'ind_comm_pub_spot_check'
    }
    self.two_dict = {
        u'企业年报': 'ent_pub_ent_annual_report',
        u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
        u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
        u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
        u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
        u'股权变更信息': 'ent_pub_equity_change',
        u'行政许可信息': 'ent_pub_administration_license',
        u'知识产权出资登记': 'ent_pub_knowledge_property',
        u'知识产权出质登记信息': 'ent_pub_knowledge_property',
        u'行政处罚信息': 'ent_pub_administration_sanction',
        u'变更信息': 'ent_pub_shareholder_modify'
    }
    self.three_dict = {
        u'行政许可信息': 'other_dept_pub_administration_license',
        u'行政处罚信息': 'other_dept_pub_administration_sanction'
    }
    self.four_dict = {
        u'股权冻结信息': 'judical_assist_pub_equity_freeze',
        u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
        u'股东变更信息': 'judical_assist_pub_shareholder_modify',
        u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
    }
from bs4 import BeautifulSoup
from enterprise.libs.CaptchaRecognition import CaptchaRecognition
import random
import threading
from common_func import get_proxy, exe_time, get_user_agent
import gevent
from gevent import Greenlet
import gevent.monkey
import traceback

# Default HTTP headers applied to the crawler's session.
# NOTE(review): 'Connetion' is a typo for 'Connection'; servers will ignore
# the misspelled header, so keep-alive is likely not being requested here --
# confirm before fixing, since the value is sent on the wire as-is.
headers = {
    'Connetion': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    "User-Agent": get_user_agent()
}


class SichuanCrawler(object):
    """Sichuan crawler, inherits from object. Captcha scheme is the same as Shaanxi's."""
    # Serializes file writes shared across crawler threads.
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path=None):
        # Identifier of the enterprise record currently being crawled.
        self.pripid = None
        # Millisecond timestamp string used as a request parameter.
        self.cur_time = str(int(time.time() * 1000))
        self.reqst = requests.Session()
        self.reqst.headers.update(headers)
        # Large connection pool: pages are fetched concurrently via gevent.
        adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.reqst.mount('http://', adapter)
        self.json_restore_path = json_restore_path