Exemple #1
0
    def __init__(self, json_restore_path=None):
        """Set up the HTTP session, captcha recognizer and output paths for Hebei.

        Args:
            json_restore_path: base output directory. Although the default is
                None, a string is required in practice: concatenating None
                with the path suffixes below raises TypeError.
        """
        # A 'Connetion': 'Keep-Alive' entry (sic) was commented out in the
        # original header set and is still not sent.
        request_headers = {
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US, en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            "User-Agent": get_user_agent(),
        }
        self.CR = CaptchaRecognition("hebei")

        session = requests.Session()
        self.requests = session
        session.headers.update(request_headers)
        # Only plain-http URLs are routed through the enlarged connection pool.
        session.mount(
            'http://',
            requests.adapters.HTTPAdapter(pool_connections=100,
                                          pool_maxsize=100))

        self.ents, self.json_dict = {}, {}
        self.json_restore_path = json_restore_path
        self.csrf = ""

        output_root = self.json_restore_path
        # Storage path for the captcha image.
        self.path_captcha = output_root + '/hebei/ckcode.jpeg'
        # Storage path for HTML data.
        self.html_restore_path = output_root + '/hebei/'

        self.proxies = get_proxy('hebei')

        # presumably the requests (connect, read) timeout pair -- confirm
        # where self.timeout is consumed.
        self.timeout = (30, 20)
Exemple #2
0
 def __init__(self, json_restore_path=None):
     """Run the parent initializer, then set the Xinjiang paths and proxies.

     Args:
         json_restore_path: base output directory, forwarded to the parent
             initializer. Must be a string in practice: concatenating the
             default None with the path suffixes below raises TypeError.
     """
     super(XinjiangCrawler, self).__init__(json_restore_path)
     self.json_restore_path = json_restore_path
     output_root = self.json_restore_path
     # Storage path for HTML data.
     self.html_restore_path = output_root + '/xinjiang/'
     # Storage path for the captcha image.
     self.path_captcha = output_root + '/xinjiang/ckcode.jpeg'
     self.proxies = get_proxy('xinjiang')
Exemple #3
0
    def __init__(self, json_restore_path=None):
        """Run the parent initializer, then set the Yunnan paths and proxies.

        Args:
            json_restore_path: base output directory, forwarded to the parent
                initializer. Must be a string in practice: concatenating the
                default None with the path suffixes below raises TypeError.
        """
        super(YunnanCrawler, self).__init__(json_restore_path)
        self.json_restore_path = json_restore_path

        output_root = self.json_restore_path
        # First the captcha image path, then the HTML data directory.
        self.path_captcha, self.html_restore_path = (
            output_root + '/yunnan/ckcode.jpeg',
            output_root + '/yunnan/',
        )

        self.proxies = get_proxy('yunnan')
Exemple #4
0
 def __init__(self, json_restore_path=None):
     """Run the parent initializer, then set Hubei paths, parser and proxies.

     Args:
         json_restore_path: base output directory, forwarded to the parent
             initializer. Must be a string in practice: concatenating the
             default None with the path suffixes below raises TypeError.
     """
     super(HubeiCrawler, self).__init__(json_restore_path)
     self.json_restore_path = json_restore_path
     output_root = self.json_restore_path
     # Storage path for the captcha image.
     self.ckcode_image_path = output_root + '/hubei/ckcode.jpg'
     # Storage path for HTML data.
     self.html_restore_path = output_root + '/hubei/'
     # The parser receives this crawler instance, so it is created only
     # after both paths above exist.
     self.parser = HubeiParser(self)
     self.proxies = get_proxy('hubei')
    def __init__(self, json_restore_path=None):
        """Run the parent initializer, then set the Shanghai paths and proxies.

        Args:
            json_restore_path: base output directory, forwarded to the parent
                initializer. Must be a string in practice: concatenating the
                default None with the path suffixes below raises TypeError.
        """
        super(ShanghaiCrawler, self).__init__(json_restore_path)
        self.json_restore_path = json_restore_path

        output_root = self.json_restore_path
        # NOTE(review): the directory name is capitalised ('/Shanghai/') while
        # the proxy key below is lower-case -- confirm this is intended.
        # Storage path for HTML data.
        self.html_restore_path = output_root + '/Shanghai/'
        # Storage path for the captcha image.
        self.path_captcha = output_root + '/Shanghai/ckcode.jpeg'

        self.proxies = get_proxy('shanghai')
Exemple #6
0
    def __init__(self, json_restore_path=None):
        """Run the parent initializer, then set Xizang paths, parser and proxies.

        Args:
            json_restore_path: base output directory, forwarded to the parent
                initializer. Must be a string in practice: concatenating the
                default None with the path suffixes below raises TypeError.
        """
        super(XizangCrawler, self).__init__(json_restore_path)
        self.json_restore_path = json_restore_path

        output_root = self.json_restore_path
        # First the HTML data directory, then the captcha image path.
        self.html_restore_path, self.ckcode_image_path = (
            output_root + '/xizang/',
            output_root + '/xizang/ckcode.jpg',
        )

        # The parser is handed this crawler instance.
        self.parser = XizangParser(self)
        self.proxies = get_proxy('xizang')
Exemple #7
0
    def __init__(self, json_restore_path=None):
        """Initialise the Liaoning crawler: paths, parser, proxies and timeout.

        The parent initializer is invoked without arguments.

        Args:
            json_restore_path: base output directory. Must be a string in
                practice: concatenating the default None with the path
                suffixes below raises TypeError.
        """
        super(LiaoningCrawler, self).__init__()
        self.json_restore_path = json_restore_path

        output_root = self.json_restore_path
        # Storage path for the captcha image.
        self.ckcode_image_path = output_root + '/liaoning/ckcode.jpg'
        # Storage path for HTML data.
        self.html_restore_path = output_root + '/liaoning/'

        # The parser is handed this crawler instance.
        self.parser = LiaoningParser(self)
        self.proxies = get_proxy('liaoning')
        self.timeout = (30, 20)
Exemple #8
0
    def __init__(self, json_restore_path=None):
        """Initialise the Zongju crawler: paths, parser, proxies and timeout.

        The parent initializer is invoked without arguments.

        Args:
            json_restore_path: base output directory. Must be a string in
                practice: concatenating the default None with the path
                suffixes below raises TypeError.
        """
        super(ZongjuCrawler, self).__init__()
        self.json_restore_path = json_restore_path

        output_root = self.json_restore_path
        # Storage path for the captcha image.
        self.ckcode_image_path = output_root + '/zongju/ckcode.jpg'
        # Storage path for HTML data.
        self.html_restore_path = output_root + '/zongju/'

        # The parser is handed this crawler instance.
        self.parser = ZongjuParser(self)
        # NOTE(review): the proxy key is 'beijing' rather than 'zongju' --
        # presumably deliberate; confirm against get_proxy's key set.
        self.proxies = get_proxy('beijing')
        self.timeout = (30, 20)
Exemple #9
0
    def __init__(self, json_restore_path=None):
        """Run the parent initializer, then set Hainan paths, parser and proxies.

        Args:
            json_restore_path: base output directory, forwarded to the parent
                initializer. Must be a string in practice: concatenating the
                default None with the path suffixes below raises TypeError.
        """
        # The original also carried a commented-out direct call to
        # HeilongjiangClawer.__init__(self, json_restore_path) at this point.
        super(HainanCrawler, self).__init__(json_restore_path)
        self.json_restore_path = json_restore_path

        output_root = self.json_restore_path
        # Storage path for the captcha image.
        self.ckcode_image_path = output_root + '/hainan/ckcode.jpg'
        # Storage path for HTML data.
        self.html_restore_path = output_root + '/hainan/'

        # The parser is handed this crawler instance.
        self.parser = HainanParser(self)
        self.proxies = get_proxy('hainan')
Exemple #10
0
    def __init__(self, json_restore_path=None):
        """Run the parent initializer, then set Shanxi paths, parser and proxies.

        Args:
            json_restore_path: base output directory, forwarded to the parent
                initializer. Must be a string in practice: concatenating the
                default None with the path suffixes below raises TypeError.
        """
        # The original also carried a commented-out direct call to
        # HeilongjiangClawer.__init__(self, json_restore_path) at this point.
        super(ShanxiCrawler, self).__init__(json_restore_path)
        self.json_restore_path = json_restore_path

        output_root = self.json_restore_path
        # First the HTML data directory, then the captcha image path.
        self.html_restore_path, self.ckcode_image_path = (
            output_root + '/shanxi/',
            output_root + '/shanxi/ckcode.jpg',
        )

        # The parser is handed this crawler instance.
        self.parser = ShanxiParser(self)
        self.proxies = get_proxy('shanxi')
Exemple #11
0
    def __init__(self, json_restore_path=None):
        """Initialise the Heilongjiang crawler: paths, parser, proxies, timeout.

        The parent initializer is invoked without arguments (an explicit
        Crawler.__init__(self) call was commented out in the original).

        Args:
            json_restore_path: base output directory. Must be a string in
                practice: concatenating the default None with the path
                suffixes below raises TypeError.
        """
        super(HeilongjiangClawer, self).__init__()

        self.json_restore_path = json_restore_path
        output_root = self.json_restore_path
        # Storage path for the captcha image.
        self.ckcode_image_path = output_root + '/heilongjiang/ckcode.jpg'
        # Storage path for HTML data.
        self.html_restore_path = output_root + '/heilongjiang/'

        # The parser is handed this crawler instance.
        self.parser = HeilongjiangParser(self)
        self.proxies = get_proxy('heilongjiang')
        self.timeout = (30, 20)
    def __init__(self, json_restore_path=None):
        """Set up the HTTP session, captcha recognizer and paths for Neimenggu.

        Args:
            json_restore_path: base output directory. Although the default is
                None, a string is required in practice: concatenating None
                with the path suffixes below raises TypeError.
        """
        self.html_showInfo = None
        self.Captcha = None
        # NOTE(review): the recognizer is built with the "guangdong" model
        # although every path here is for neimenggu -- confirm intended.
        self.CR = CaptchaRecognition("guangdong")

        session = requests.Session()
        self.requests = session
        # `headers` is not defined in this method; it is resolved from an
        # enclosing scope.
        session.headers.update(headers)
        # Only plain-http URLs are routed through the enlarged connection pool.
        session.mount(
            'http://',
            requests.adapters.HTTPAdapter(pool_connections=100,
                                          pool_maxsize=100))

        self.ents = {}
        self.json_restore_path = json_restore_path
        output_root = self.json_restore_path
        self.dir_restore_path = output_root + '/neimenggu/'
        # Storage path for the captcha image.
        self.path_captcha = output_root + '/neimenggu/ckcode.jpg'
        self.timeout = (30, 20)

        proxies = get_proxy('neimenggu')
        if proxies:
            # Parenthesised single-argument print: same output on Python 2,
            # and no longer a SyntaxError on Python 3.
            print(proxies)
            session.proxies = proxies
Exemple #13
0
    def __init__(self, json_restore_path=None):
        """Prepare the HTTP session, output paths and section-title lookup tables.

        Args:
            json_restore_path: base output directory. Although the default is
                None, a string is required in practice: concatenating None
                with the path suffixes below raises TypeError.
        """
        self.pripid = None
        # Millisecond timestamp taken once at construction, stored as a string.
        self.cur_time = str(int(time.time() * 1000))

        session = requests.Session()
        self.reqst = session
        # `headers` is not defined in this method; it is resolved from an
        # enclosing scope.
        session.headers.update(headers)
        # Only plain-http URLs are routed through the enlarged connection pool.
        session.mount(
            'http://',
            requests.adapters.HTTPAdapter(pool_connections=100,
                                          pool_maxsize=100))

        self.json_restore_path = json_restore_path
        # Storage path for the captcha image.
        self.ckcode_image_path = self.json_restore_path + '/sichuan/ckcode.jpg'
        # Storage path for HTML data.
        self.html_restore_path = self.json_restore_path + '/sichuan/'
        self.code_cracker = CaptchaRecognition('sichuan')
        self.result_json_dict = {}
        self.json_list = []

        # NOTE(review): the proxy key is 'shaanxi' although every path and the
        # captcha model here are 'sichuan' -- looks like a copy/paste
        # leftover; left unchanged until confirmed.
        proxies = get_proxy('shaanxi')
        if proxies:
            # Parenthesised single-argument print: same output on Python 2,
            # and no longer a SyntaxError on Python 3.
            print(proxies)
            session.proxies = proxies
        self.timeout = (30, 20)
        self.ents = {}

        # NOTE(review): 'eareName' points at ahcredit.gov.cn while the other
        # endpoints are on gsxt.scaic.gov.cn -- confirm it is still wanted.
        self.mydict = {
            'eareName': 'http://www.ahcredit.gov.cn',
            'search': 'http://gsxt.scaic.gov.cn/ztxy.do?method=index&random=',
            'searchList':
                'http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=',
            'validateCode': 'http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm',
        }

        # The four tables below map section titles (unicode) to result-field
        # identifiers; several titles are spelling variants that share one
        # identifier.
        self.one_dict = {
            u'基本信息': 'ind_comm_pub_reg_basic',
            u'股东信息': 'ind_comm_pub_reg_shareholder',
            u'发起人信息': 'ind_comm_pub_reg_shareholder',
            u'股东(发起人)信息': 'ind_comm_pub_reg_shareholder',
            u'变更信息': 'ind_comm_pub_reg_modify',
            u'主要人员信息': 'ind_comm_pub_arch_key_persons',
            u'分支机构信息': 'ind_comm_pub_arch_branch',
            u'清算信息': 'ind_comm_pub_arch_liquidation',
            u'动产抵押登记信息': 'ind_comm_pub_movable_property_reg',
            u'股权出置登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'股权出质登记信息': 'ind_comm_pub_equity_ownership_reg',
            u'行政处罚信息': 'ind_comm_pub_administration_sanction',
            u'经营异常信息': 'ind_comm_pub_business_exception',
            u'严重违法信息': 'ind_comm_pub_serious_violate_law',
            u'抽查检查信息': 'ind_comm_pub_spot_check',
        }

        self.two_dict = {
            u'企业年报': 'ent_pub_ent_annual_report',
            u'企业投资人出资比例': 'ent_pub_shareholder_capital_contribution',
            u'股东(发起人)及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息(币种与注册资本一致)': 'ent_pub_shareholder_capital_contribution',
            u'股东及出资信息': 'ent_pub_shareholder_capital_contribution',
            u'股权变更信息': 'ent_pub_equity_change',
            u'行政许可信息': 'ent_pub_administration_license',
            u'知识产权出资登记': 'ent_pub_knowledge_property',
            u'知识产权出质登记信息': 'ent_pub_knowledge_property',
            u'行政处罚信息': 'ent_pub_administration_sanction',
            u'变更信息': 'ent_pub_shareholder_modify',
        }

        self.three_dict = {
            u'行政许可信息': 'other_dept_pub_administration_license',
            u'行政处罚信息': 'other_dept_pub_administration_sanction',
        }

        self.four_dict = {
            u'股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'司法股权冻结信息': 'judical_assist_pub_equity_freeze',
            u'股东变更信息': 'judical_assist_pub_shareholder_modify',
            u'司法股东变更登记信息': 'judical_assist_pub_shareholder_modify',
        }
Exemple #14
0
    def __init__(self, json_restore_path=None):
        """Initialise the Jiangsu crawler: paths, proxies, parser and POST table.

        The parent initializer is invoked without arguments.

        Args:
            json_restore_path: storage path of the JSON file. All Jiangsu
                enterprises should be written to the same file, so use the
                same path for every thread when crawling with multiple
                threads; writes to that file also need to be guarded by a
                lock. Must be a string in practice: concatenating the default
                None with the path suffixes below raises TypeError.
        """
        super(JiangsuCrawler, self).__init__()
        self.json_restore_path = json_restore_path
        # Storage path for HTML data.
        self.html_restore_path = self.json_restore_path + '/jiangsu/'
        # Storage path for the captcha image.
        self.ckcode_image_path = self.json_restore_path + '/jiangsu/ckcode.jpg'

        proxies = get_proxy('jiangsu')
        if proxies:
            # Parenthesised single-argument print: same output on Python 2,
            # and no longer a SyntaxError on Python 3.
            print(proxies)
            # self.reqst is not created in this method; presumably the parent
            # initializer provides it -- confirm.
            self.reqst.proxies = proxies
        self.timeout = (30, 20)
        # The parser is handed this crawler instance.
        self.parser = JiangsuParser(self)

        self.corp_org = ''
        self.corp_id = ''
        self.corp_seq_id = ''
        self.common_enter_post_data = {}
        self.ci_enter_post_data = {}
        self.nb_enter_post_data = {}

        # Each factory returns a fresh dict, so no two entries share state.
        def ci_enter(post_type, specific_query):
            """Entry served by the 'ci_enter' URL, selected by specificQuery."""
            return {
                'url_type': 'ci_enter',
                'post_type': post_type,
                'specificQuery': specific_query,
            }

        def common_enter(properties_name):
            """Entry served by the 'common_enter' URL, selected by propertiesName."""
            return {
                'url_type': 'common_enter',
                'post_type': 'common_enter',
                'propertiesName': properties_name,
            }

        def nb_enter(properties_name):
            """Entry served by the 'nb_enter' URL, selected by propertiesName."""
            return {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
                'propertiesName': properties_name,
            }

        # Result-field identifier -> description of the POST used to fetch it.
        # 'ind_comm_pub_arch_liquadition' (ci_enter URL, common_enter post,
        # specificQuery 'qsfzr') and 'ind_comm_pub_serious_violate_law'
        # (common_enter, propertiesName 'xxx') were commented out in the
        # original table and remain disabled.
        self.post_info = {
            'ind_comm_pub_reg_basic': ci_enter('ci_enter', 'basicInfo'),
            'ind_comm_pub_reg_shareholder':
                ci_enter('ci_enter_with_recordline', 'investmentInfor'),
            'ind_comm_pub_reg_modify': common_enter('biangeng'),
            'ind_comm_pub_arch_key_persons':
                ci_enter('ci_enter_with_recordline', 'personnelInformation'),
            'ind_comm_pub_arch_branch':
                ci_enter('ci_enter_with_recordline', 'branchOfficeInfor'),
            'ind_comm_pub_movable_property_reg': common_enter('dongchan'),
            'ind_comm_pub_equity_ownership_reg': common_enter('guquanchuzhi'),
            'ind_comm_pub_administration_sanction': common_enter('chufa'),
            'ind_comm_pub_business_exception': common_enter('abnormalInfor'),
            'ind_comm_pub_spot_check': common_enter('checkup'),
            'ind_comm_pub_reg_shareholder_detail': {
                'url_type': 'ci_detail',
                'post_type': 'ci_detail',
                'specificQuery': 'investorInfor',
            },
            'ent_pub_annual_report': nb_enter('query_report_list'),
            # The only entry without a propertiesName / specificQuery field.
            'annual_report_detail': {
                'url_type': 'nb_enter',
                'post_type': 'nb_enter',
            },
            'ent_pub_shareholder_capital_contribution': nb_enter('query_tzcz'),
            'ent_pub_administrative_license': nb_enter('query_xzxk'),
            'ent_pub_knowledge_property': nb_enter('query_zscq'),
            'ent_pub_administration_sanction': nb_enter('query_xzcf'),
            'other_dept_pub_administration_license': common_enter('xingzheng'),
            'other_dept_pub_administration_sanction':
                common_enter('xingzhengchufa'),
            'judical_assist_pub_equity_freeze': common_enter('gqdjList'),
            'judical_assist_pub_shareholder_modify': common_enter('gdbgList'),
        }