Ejemplo n.º 1
0
    def __init__(self, json_restore_path=None):
        """Prepare the HTTP session, captcha recogniser and storage paths.

        Args:
            json_restore_path: Base directory (a string).  The captcha image
                and the HTML dumps are stored under its ``/hebei/``
                sub-directory.  The signature keeps the historical ``None``
                default, but a real path is required.

        Raises:
            TypeError: If ``json_restore_path`` is ``None``.
        """
        if json_restore_path is None:
            # The paths below are built by string concatenation, which would
            # fail on None with an opaque "unsupported operand" TypeError
            # after the session was already created.  Fail first, same
            # exception type, clearer message.
            raise TypeError(
                "json_restore_path must be a directory path string, not None")

        default_headers = {
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            "User-Agent": get_user_agent()
        }
        self.CR = CaptchaRecognition("hebei")

        # One session for every request made by this crawler.
        self.requests = requests.Session()
        self.requests.headers.update(default_headers)
        # The enlarged connection pool is mounted for plain-HTTP URLs only.
        self.requests.mount(
            'http://',
            requests.adapters.HTTPAdapter(pool_connections=100,
                                          pool_maxsize=100))

        self.ents = {}
        self.json_dict = {}
        self.csrf = ""

        self.json_restore_path = json_restore_path
        # Where the captcha image is stored.
        self.path_captcha = self.json_restore_path + '/hebei/ckcode.jpeg'
        # Where the HTML data is stored.
        self.html_restore_path = self.json_restore_path + '/hebei/'

        self.proxies = get_proxy('hebei')

        # NOTE(review): presumably handed to requests as a
        # (connect, read) timeout pair -- confirm at the call sites.
        self.timeout = (30, 20)
Ejemplo n.º 2
0
    def __init__(self):
        """Create the HTTP session with its default headers and an empty
        ``json_dict``, then print a short trace message."""
        session = requests.Session()
        session.headers.update({
            'Connection': "keep-alive",
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': get_user_agent()
        })
        # The enlarged connection pool is mounted for plain-HTTP URLs only.
        session.mount(
            'http://',
            requests.adapters.HTTPAdapter(pool_connections=100,
                                          pool_maxsize=100))
        self.reqst = session
        self.json_dict = {}

        # Parenthesised so the line parses under Python 3 as well; under
        # Python 2 it prints exactly the same text as the print statement.
        print("In Crawler")
Ejemplo n.º 3
0
    def __init__(self, json_restore_path=None):
        """Initialise session, storage paths, URLs and title lookup tables.

        Args:
            json_restore_path: Base directory (a string).  HTML dumps and the
                captcha image are stored under its ``/guizhou/``
                sub-directory.  The signature keeps the historical ``None``
                default, but a real path is required.

        Raises:
            TypeError: If ``json_restore_path`` is ``None``.
        """
        if json_restore_path is None:
            # The paths below are built by string concatenation, which would
            # fail on None with an opaque "unsupported operand" TypeError.
            # Fail first, same exception type, clearer message.
            raise TypeError(
                "json_restore_path must be a directory path string, not None")

        # Creation time as a millisecond timestamp, kept as a string.
        self.cur_time = str(int(time.time() * 1000))
        self.nbxh = None
        self.reqst = requests.Session()

        self.json_restore_path = json_restore_path
        self.html_restore_path = self.json_restore_path + '/guizhou/'
        self.ckcode_image_path = self.json_restore_path + '/guizhou/ckcode.jpg'

        self.code_cracker = CaptchaRecognition('guizhou')
        # Initialised once here (the original assigned it a second time at
        # the end of the method).
        self.result_json_dict = {}

        self.reqst.headers.update({
            'Connection': "keep-alive",
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language':
            'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': get_user_agent()
        })

        # URLs used by the crawler.
        # NOTE(review): 'eareName' points at a different host than the other
        # three entries -- confirm that this is intended.
        self.mydict = {
            'eareName': 'http://www.ahcredit.gov.cn',
            'search': 'http://gsxt.gzgs.gov.cn/',
            'searchList': 'http://gsxt.gzgs.gov.cn/search!searchSczt.shtml',
            'validateCode':
            'http://gsxt.gzgs.gov.cn/search!generateCode.shtml?validTag=searchImageCode&'
        }

        # The four tables below map a (Chinese) section title to a result
        # field name.  Several spellings of a title may share one field.

        # Titles -> 'ind_comm_pub_*' fields.
        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check'
        }

        # Titles -> 'ent_pub_*' fields.
        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify'
        }

        # Titles -> 'other_dept_pub_*' fields.
        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction'
        }

        # Titles -> 'judical_assist_pub_*' fields.
        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify'
        }
Ejemplo n.º 4
0
from bs4 import BeautifulSoup
from enterprise.libs.CaptchaRecognition import CaptchaRecognition
import random
import threading

from common_func import get_proxy, exe_time, get_user_agent
import gevent
from gevent import Greenlet
import gevent.monkey
import traceback

# Default request headers; SichuanCrawler applies them to its session.
# The User-Agent value is picked once, when this module is imported.
headers = {
    # Was misspelled 'Connetion', which sent a meaningless header instead of
    # the intended Connection: Keep-Alive.
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    "User-Agent": get_user_agent()
}


class SichuanCrawler(object):
    """ 四川爬虫, 继承object, 验证码与陕西一致。"""
    write_file_mutex = threading.Lock()

    def __init__(self, json_restore_path=None):
        self.pripid = None
        self.cur_time = str(int(time.time() * 1000))
        self.reqst = requests.Session()
        self.reqst.headers.update(headers)
        adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.reqst.mount('http://', adapter)
        self.json_restore_path = json_restore_path