예제 #1
0
    def initNbiter(self, module_super):
        """Attach the annual-report loop and its detail fetcher.

        A module without a visit callback, built from
        ``Iterator("qynb_list", "nianb")``, is appended to ``module_super``.
        A sub-module bound to ``self.visitQynb`` is then appended to it.

        :param module_super: parent module that receives the loop module.
        """
        report_loop = Module(None, u"获取公司年报",
                             Iterator("qynb_list", "nianb"))
        module_super.appendSubModule(report_loop, True)

        detail_fetcher = Module(self.visitQynb, u"获取年报详情")

        def prepare(nianb):
            # Expose the first two fields of the entry as nb_url / nb_name.
            return {'nb_url': nianb[0], 'nb_name': nianb[1]}

        request_headers = {
            'Host': 'www.sgs.gov.cn',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive'
        }

        detail_fetcher.appendInput(InputType.FUNCTION, input_value=prepare)
        detail_fetcher.appendUrl('nb_url')
        detail_fetcher.appendHeaders(request_headers)
        report_loop.appendSubModule(detail_fetcher)
예제 #2
0
    def initAnnualReport(self, module_super):
        """Attach the annual-report link loop and the per-report fetcher.

        A loop module built from ``Iterator(seeds="nb_list",
        param_name="nb")`` is appended to ``module_super``.  A sub-module
        bound to ``self.visitQynb`` is appended beneath it with a 2-unit
        ``Sleep``.

        :param module_super: parent module that receives the loop module.
        """
        report_loop = Module(None, u"遍历企业年报列表获取Url",
                             Iterator(seeds="nb_list", param_name="nb"))
        module_super.appendSubModule(report_loop)

        report_fetcher = Module(self.visitQynb, u"获取企业年报详细信息")

        def annual_convert(nb):
            # Join the xpath results into a single href / link text.
            href = ''.join(nb.xpath("@href"))
            label = ''.join(nb.xpath("text()"))
            return {
                "nb_url": "http://aic.hainan.gov.cn:1888%s" % href,
                "nb_name": label.replace(u"年度报告", ""),
            }

        def report_headers(company_url):
            # The company page URL is sent as the Referer.
            return {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Host": "aic.hainan.gov.cn:1888",
                "Referer": company_url,
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
            }

        report_fetcher.appendInput(InputType.FUNCTION,
                                   input_value=annual_convert)
        report_fetcher.appendUrl("nb_url")
        report_fetcher.appendHeaders(report_headers)
        report_fetcher.addSleep(Sleep(2))
        report_loop.appendSubModule(report_fetcher)
예제 #3
0
 def initChangeInfoPage(self, module_super):
     """Attach the change-info paging loop and its POST page fetcher.

     A loop module built from ``Iterator("bgxx_pages", "page_no")`` is
     appended to ``module_super``; a sub-module bound to ``self.visitBgxx``
     is appended beneath it.

     :param module_super: parent module that receives the loop module.
     """
     page_loop = Module(None, u"进入变更信息翻页",
                        Iterator("bgxx_pages", "page_no"))
     module_super.appendSubModule(page_loop)

     def page_url(qyid):
         return (
             "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s"
             % qyid)

     def page_form(page_no):
         # Form fields posted for the requested page.
         return {'page': page_no, 'limit': 5, 'mark': 0}

     def page_headers(ua):
         return {
             'Host': 'gsxt.jxaic.gov.cn',
             'Connection': 'keep-alive',
             'User-Agent': ua,
             'Accept':
             'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate, sdch',
             'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
         }

     page_fetcher = Module(self.visitBgxx, u"获取变更翻页信息")
     page_fetcher.appendUrl(page_url)
     page_fetcher.appendWebMethod("post")
     page_fetcher.appendPostData(page_form)
     page_fetcher.appendHeaders(page_headers)
     page_loop.appendSubModule(page_fetcher, True)
예제 #4
0
    def initConfigCompanyInfo(self):
        """Create the per-company loop module and register its stages.

        The loop module is built from ``Iterator("search_list", "info")`` and
        appended to ``self.module_manager``.  It is then passed, in order, to
        ``initCompanyPrepare`` and ``initRouter``.
        """
        company_loop = Module(None, "获取公司信息",
                              Iterator("search_list", "info"))
        self.module_manager.appendSubModule(company_loop, True)

        # Stage order is significant: keep it as written.
        self.initCompanyPrepare(company_loop)
        self.initRouter(company_loop)
예제 #5
0
 def initShareHolderDetail(self, module_super):
     """Attach the shareholder-detail loop and its POST fetcher.

     A loop module built from ``Iterator("xh_pripid", "xh_prid")`` (module
     id ``fetch_gdxq_info``) is appended to ``module_super``.  A sub-module
     bound to ``self.visitGdxq`` posts to ``ztxy.do`` for each entry.  The
     loop module gets an exception event with 5 retries and a ``Sleep(2)``.

     :param module_super: parent module that receives the loop module.
     """
     iterator = Iterator("xh_pripid", "xh_prid")
     module = Module(None, "进入股东详情", iterator)
     module.module_id = "fetch_gdxq_info"
     module_super.appendSubModule(module, True)
     sub_module = Module(self.visitGdxq, u"获取股东翻页信息")
     sub_module.appendUrl('http://gsxt.scaic.gov.cn/ztxy.do')
     sub_module.appendHeaders(
         lambda ua: {
             "User-Agent": ua,
             "Accept":
             "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Host": "gsxt.scaic.gov.cn",
             "Origin": "http://gsxt.scaic.gov.cn",
             "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
             "Connection": "keep-alive"
         })
     sub_module.appendWebMethod("post")
     sub_module.appendPostData(
         lambda xh_prid: {
             # xh_prid is indexed as [entbigtype, pripid].
             'maent.pripid': xh_prid[1],
             'maent.entbigtype': xh_prid[0],
             # Millisecond timestamp.  The literal used to list 'random'
             # twice; the duplicate key was redundant and has been removed.
             'random': str(int(time.time() * 1000)),
             'method': 'tzrCzxxDetial'
         })
     module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
     module.addSleep(Sleep(2))
     module.appendSubModule(sub_module, True)
예제 #6
0
    def initChangeInfoPage(self, module_super):
        """Attach the change-info paging loop and its POST page fetcher.

        A loop module built from ``Iterator("bgxx_pages", "page_no")`` is
        appended to ``module_super``; a sub-module bound to
        ``self.visitBgxx`` with fixed request headers is appended beneath it.

        :param module_super: parent module that receives the loop module.
        """
        page_loop = Module(None, "进入变更信息翻页",
                           Iterator("bgxx_pages", "page_no"))
        module_super.appendSubModule(page_loop)

        def page_url(qyid):
            return (
                "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s"
                % qyid)

        def page_form(page_no):
            # Form fields posted for the requested page.
            return {'page': page_no, 'limit': 5, 'mark': 0}

        fixed_headers = {
            'Host': 'gsxt.jxaic.gov.cn',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

        page_fetcher = Module(self.visitBgxx, "获取变更翻页信息")
        page_fetcher.appendUrl(page_url)
        page_fetcher.appendWebMethod("post")
        page_fetcher.appendPostData(page_form)
        page_fetcher.appendHeaders(fixed_headers)
        page_loop.appendSubModule(page_fetcher)
예제 #7
0
    def initAnnalsInfo(self, module_super):
        """Attach the annual-report loop module and delegate its details.

        The loop module is built from ``Iterator("annals_list", "annal")``,
        given the module id ``get_annals_info`` and appended to
        ``module_super``; it is then passed to ``initAnnalsDetails``.

        :param module_super: parent module that receives the loop module.
        """
        annals_loop = Module(None, u"获取年报信息",
                             Iterator("annals_list", "annal"))
        annals_loop.module_id = "get_annals_info"
        module_super.appendSubModule(annals_loop, True)

        self.initAnnalsDetails(annals_loop)
예제 #8
0
    def initConfigCompanyInfo(self):
        """Create the per-company loop module and register its stages.

        The loop module is built from ``Iterator("search_list", "info")`` and
        appended to ``self.module_manager``.  It is then passed, in order, to
        each ``init*`` method called below.
        """
        company_loop = Module(None, u"获取公司信息",
                              Iterator("search_list", "info"))
        self.module_manager.appendSubModule(company_loop, True)

        # Stage order is significant: keep it as written.
        self.initCompanyInfoPrepare(company_loop)
        self.initConfigBaseInfo(company_loop)
        self.initGdxq(company_loop)
        self.initNianBao(company_loop)
        self.initNbiter(company_loop)
        self.initResultCollect(company_loop)
예제 #9
0
 def initConfigCompanyInfo(self):
     """Create the per-company loop module and register its stages.

     The loop module is built from ``Iterator("tag_alist", "tag_a")`` and
     appended to ``self.module_manager``.  It is then passed, in order, to
     each ``init*`` method called below.
     """
     company_loop = Module(None, u"获取公司信息",
                           Iterator("tag_alist", "tag_a"))
     self.module_manager.appendSubModule(company_loop, True)
     # Stage order is significant: keep it as written.
     self.initConfigCompanyInfoPre(company_loop)
     self.initConfigBaseInfo(company_loop)
     self.initArchiveInfo(company_loop)
     self.initPenaltyInfo(company_loop)
     self.initShareHolderDetail(company_loop)
     self.initAnnualReportPre(company_loop)
     self.initAnnualReport(company_loop)
     self.initResultCollect(company_loop)
예제 #10
0
 def initConfigCompanyInfo(self):
     """Create the per-company loop module and register its stages.

     The loop module is built from ``Iterator("search_list", "com")`` and
     appended to ``self.module_manager``.  It is then passed, in order, to
     each ``init*`` method called below.
     """
     company_loop = Module(None, u"获取公司信息",
                           Iterator("search_list", "com"))
     self.module_manager.appendSubModule(company_loop, True)
     # Stage order is significant: keep it as written.
     self.initCompanyInfo(company_loop)
     self.initBaxxInfo(company_loop)
     self.initFzhgInfo(company_loop)
     self.initXzcfxxInfo(company_loop)
     self.initGdxqInfoPrepare(company_loop)
     self.initAnnualReportPre(company_loop)
     self.initAnnualReport(company_loop)
     self.initResultCollect(company_loop)
예제 #11
0
    def fetchStockholderInfo(self, module_super):
        """Walk the shareholder-info pages, then the shareholder detail list.

        A loop module (id ``gdxx_pages``) built from
        ``Iterator(seeds="gdxx_page_range", param_name="pno")`` is appended
        to ``module_super`` and then passed to ``getStockholderInfo`` and
        ``fetchStockholderDetail``.

        :param module_super: parent module that receives the loop module.
        :return: None
        """
        page_loop = Module(
            iterator=Iterator(seeds="gdxx_page_range", param_name="pno"),
            name=u"遍历股东翻页")
        page_loop.module_id = "gdxx_pages"
        module_super.appendSubModule(page_loop)

        self.getStockholderInfo(page_loop)
        self.fetchStockholderDetail(page_loop)
예제 #12
0
    def getAnnalsInfo(self, module_super):
        """Walk the annual-report list and fetch each report's detail page.

        A loop module built from ``Iterator("annals_list", "annals")`` is
        appended to ``module_super``; a sub-module bound to
        ``self.visitQynb`` (module id ``get_annals_info``) is appended
        beneath it with a ``Sleep(3)`` and an exception event that retries
        up to 100 times, redoing ``get_annals_info``.

        :param module_super: parent module that receives the loop module.
        :return: None
        """
        annals_loop = Module(None, u"遍历年报列表",
                             Iterator("annals_list", "annals"))
        module_super.appendSubModule(annals_loop, True)

        detail_fetcher = Module(self.visitQynb, u"抓取年报详情")
        detail_fetcher.module_id = "get_annals_info"

        def prepareParams(annals):
            # Entries with fewer than two fields yield an empty mapping.
            if not annals or len(annals) < 2:
                return dict()
            return {'nb_url': annals[0], 'nb_name': annals[1]}

        detail_fetcher.appendInput(InputType.FUNCTION,
                                   input_value=prepareParams)

        def getURL(nb_url=None):
            # A missing or empty path gives None instead of a URL.
            return (u'http://218.95.241.36' + nb_url) if nb_url else None

        detail_fetcher.appendUrl(getURL)

        request_headers = {
            "Host": "218.95.241.36",
            "Proxy-Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer": "http://218.95.241.36/searchList.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }
        detail_fetcher.appendHeaders(request_headers)
        detail_fetcher.appendEncoding("utf-8")
        detail_fetcher.addSleep(Sleep(3))

        retry_event = Event(EventType.EXCEPTION_OCCURED,
                            retry_times=100,
                            redo_module="get_annals_info")
        detail_fetcher.addEvent(retry_event)

        annals_loop.appendSubModule(detail_fetcher)
예제 #13
0
    def fetchCompanyInfo(self):
        """Create the company-list loop module and register its stages.

        The loop module is built from ``Iterator("search_list", "com")``,
        gets ``self.yzmSave`` as an extra function and is appended to
        ``self.module_manager``.  It is then passed, in order, to each method
        called below.
        """
        company_loop = Module(None, u"处理公司列表",
                              Iterator("search_list", "com"))
        # Save the captcha image.
        company_loop.appendExtraFunction(self.yzmSave)
        self.module_manager.appendSubModule(company_loop, True)

        # Stage order is significant: keep it as written.
        self.prepareCompnanyParms(company_loop)
        self.getCompanyInfo(company_loop)
        self.fetchCmpnyGdxq(company_loop)
        self.getCompanyRecordInfo(company_loop)
        self.getCompanyPunishInfo(company_loop)
        # NOTE: getAnnalsList / getAnnalsInfo are deliberately not wired in
        # here (they were disabled in the original code).
        self.initResultCollect(company_loop)
예제 #14
0
    def initConfigCompanyInfo(self):
        """Create the per-company loop module and register its stages.

        The loop module is built from ``Iterator("search_list", "com")``,
        gets ``self.yzmSave`` as an extra function and is appended to
        ``self.module_manager``.  It is then passed, in order, to each
        ``init*`` method called below.
        """
        company_loop = Module(None, u"获取公司信息",
                              Iterator("search_list", "com"))
        # Save the captcha image.
        company_loop.appendExtraFunction(self.yzmSave)
        self.module_manager.appendSubModule(company_loop, True)

        # Stage order is significant: keep it as written.
        self.initConfigBaseInfo(company_loop)
        self.initConfigShareHolderInfo(company_loop)
        self.initConfigChangeInfo(company_loop)
        self.initArchiveInfo(company_loop)
        self.initBranchInfo(company_loop)
        self.initPenaltyInfo(company_loop)
        self.initAnnalsList(company_loop)
        self.initAnnalsInfo(company_loop)
        self.initResultCollect(company_loop)
예제 #15
0
    def getAnnalsInfo(self, module_super):
        """Walk the annual-report list and fetch each report's detail page.

        A loop module built from ``Iterator("annals_list", "annals")`` is
        appended to ``module_super``; a sub-module bound to
        ``self.visitQynb`` (module id ``get_annals_info``) is appended
        beneath it with a ``Sleep(3)`` and an exception event that retries
        up to 100 times, redoing ``get_annals_info``.

        :param module_super: parent module that receives the loop module.
        """
        annals_loop = Module(None, u"遍历年报列表",
                             Iterator("annals_list", "annals"))
        module_super.appendSubModule(annals_loop, True)

        detail_fetcher = Module(self.visitQynb, u"抓取年报详情")
        detail_fetcher.module_id = "get_annals_info"

        def prepareParams(annals):
            # Entries with fewer than two fields yield an empty mapping.
            if not annals or len(annals) < 2:
                return dict()
            return {'nb_url': annals[0], 'nb_name': annals[1]}

        detail_fetcher.appendInput(InputType.FUNCTION,
                                   input_value=prepareParams)

        def getURL(nb_url=None):
            # A missing or empty path gives None instead of a URL.
            return (u'http://gsxt.hljaic.gov.cn' + nb_url) if nb_url else None

        detail_fetcher.appendUrl(getURL)

        request_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
        }
        detail_fetcher.appendHeaders(request_headers)
        detail_fetcher.appendEncoding("utf-8")
        detail_fetcher.addSleep(Sleep(3))

        retry_event = Event(EventType.EXCEPTION_OCCURED,
                            retry_times=100,
                            redo_module="get_annals_info")
        detail_fetcher.addEvent(retry_event)

        annals_loop.appendSubModule(detail_fetcher)
예제 #16
0
    def initShareholderInfoDetail(self, module_super):
        """Attach the shareholder-detail loop and its detail fetcher.

        A loop module built from ``Iterator("gdxx_list", "gdxx_rcd")`` is
        appended to ``module_super``; a sub-module bound to
        ``self.visitGdxq`` is appended beneath it with a ``Sleep(2)``.  The
        sub-module's URL is derived from the record's ``onclick`` cell.

        :param module_super: parent module that receives the loop module.
        """
        iterator = Iterator("gdxx_list", "gdxx_rcd")
        module = Module(None, u"开始获取股东详情", iterator)
        module_super.appendSubModule(module, True)

        sub_module = Module(self.visitGdxq, u"获取股东详情信息")

        def getGdxqUrl(gdxx_rcd):
            """Return the detail URL from the first onclick cell, else None."""
            # Function-scope import keeps this block self-contained.
            import ast

            for key in gdxx_rcd:
                cell = gdxx_rcd[key]
                if 'onclick' in cell:
                    # SECURITY: the cell comes from a scraped page, so it must
                    # not be passed to eval().  literal_eval only accepts
                    # Python literals.
                    # NOTE(review): assumes string cells hold a dict literal
                    # - confirm against the producer of gdxx_list.
                    if isinstance(cell, basestring):
                        onclick_dict = ast.literal_eval(cell)
                    else:
                        onclick_dict = cell
                    onclick = onclick_dict["onclick"]
                    # Take the text between the first '(' and the first ')'
                    # and drop the single quotes around the argument.
                    xq_link = onclick[onclick.find('(')+1:onclick.find(')')].replace("'", "")
                    xq_url = "http://aic.hainan.gov.cn:1888" + xq_link
                    return xq_url
            return None

        sub_module.appendUrl(getGdxqUrl)
        sub_module.addSleep(Sleep(2))
        module.appendSubModule(sub_module)
예제 #17
0
    def initConfigCompanyInfo(self):
        """Create the per-company loop module and register its stages.

        The loop module is built from ``Iterator("search_list", "com")`` and
        appended to ``self.module_manager``.  It is then passed, in order, to
        each ``init*`` method called below.
        """
        company_loop = Module(None, "获取公司信息",
                              Iterator("search_list", "com"))
        self.module_manager.appendSubModule(company_loop, True)

        # Preparation, base info and top info.
        self.initCompanyInfoPrepare(company_loop)
        self.initConfigBaseInfo(company_loop)
        self.initTopInfo(company_loop)

        # Shareholder info: config, paging, detail.
        self.initConfigShareHolderInfo(company_loop)
        self.initShareHolderInfoPage(company_loop)
        self.initShareHolderDetail(company_loop)

        # Change info: config and paging.
        self.initConfigChangeInfo(company_loop)
        self.initChangeInfoPage(company_loop)

        # Archive info, then result collection last.
        self.initArchiveInfo(company_loop)
        self.initResultCollect(company_loop)
예제 #18
0
    def initShareHolderDetail(self, module_super):
        """Attach the shareholder-detail loop and its detail fetcher.

        A loop module built from ``Iterator("gdxx_list", "gdxx_rcd")`` is
        appended to ``module_super``; a sub-module bound to
        ``self.visitGdxq``, using ``self.getGdxqUrl`` for its URL and fixed
        request headers, is appended beneath it.

        :param module_super: parent module that receives the loop module.
        """
        detail_loop = Module(None, "进入股东详情",
                             Iterator("gdxx_list", "gdxx_rcd"))
        module_super.appendSubModule(detail_loop, True)

        fixed_headers = {
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Connection': 'keep-alive',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
            'Host': 'gsxt.jxaic.gov.cn',
            'Cache-Control': 'max-age=0'
        }

        detail_fetcher = Module(self.visitGdxq, "获取股东详情信息")
        detail_fetcher.appendUrl(self.getGdxqUrl)
        detail_fetcher.appendHeaders(fixed_headers)
        detail_loop.appendSubModule(detail_fetcher)
예제 #19
0
    def initGdxq(self, module_super):
        """Attach the shareholder-detail loop and its detail fetcher.

        A loop module built from ``Iterator(seeds="gdxq_list",
        param_name="gdxq")`` is appended to ``module_super``; a sub-module
        bound to ``self.visitGdxq`` with URL ``"gdxq"`` is appended beneath
        it.

        :param module_super: parent module that receives the loop module.
        """
        detail_loop = Module(
            iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
            name=u"遍历股东详情")
        module_super.appendSubModule(detail_loop)

        detail_fetcher = Module(self.visitGdxq, u"抓取股东详情")
        detail_fetcher.appendUrl("gdxq")

        def detail_headers(gdxq):
            # The current gdxq value is sent as the Referer.
            return {
                'Host': 'www.sgs.gov.cn',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                'Accept-Encoding': 'gzip, deflate',
                'Referer': gdxq
            }

        # NOTE(review): the headers are appended to the loop module rather
        # than to the fetching sub-module - confirm this is intentional.
        detail_loop.appendHeaders(detail_headers)
        detail_loop.appendSubModule(detail_fetcher)
예제 #20
0
 def initGdxqInfoPrepare(self, module_super):
     """Attach the shareholder-detail loop and its detail fetcher.

     A loop module built from ``Iterator("recid_list", "rid")`` is appended
     to ``module_super``; a sub-module bound to ``self.visitGdxq`` is
     appended beneath it.  The loop module gets a ``Sleep(2)`` and an
     exception event with 5 retries that redoes ``home_page``.

     :param module_super: parent module that receives the loop module.
     """
     detail_loop = Module(None, u"进入股东详情",
                          Iterator("recid_list", "rid"))
     module_super.appendSubModule(detail_loop, True)

     def detail_url(rid, com):
         # com[2] and the stripped record id form the URL path.
         return 'http://218.57.139.24/pub/gsnzczxxdetail/%s/%s'%(com[2], rid.strip())

     def detail_headers(ua, com):
         # com[1] is appended to the base URL to form the Referer.
         return {
             "Host": "218.57.139.24",
             "User-Agent": ua,
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive",
             'Referer':'http://218.57.139.24/pub/'+com[1],
         }

     detail_fetcher = Module(self.visitGdxq, u"获取股东详情")
     detail_fetcher.appendUrl(detail_url)
     detail_fetcher.appendHeaders(detail_headers)

     detail_loop.addSleep(Sleep(2))
     retry_event = Event(EventType.EXCEPTION_OCCURED, retry_times=5,
                         redo_module='home_page')
     detail_loop.addEvent(retry_event)
     detail_loop.appendSubModule(detail_fetcher, True)
예제 #21
0
    def initShareHolderDetail(self, module_super):
        """Attach the shareholder-detail loop and its detail fetcher.

        A loop module built from ``Iterator("gdxx_list", "gdxx_rcd")`` is
        appended to ``module_super``; a sub-module bound to
        ``self.visitGdxq``, using ``self.getGdxqUrl`` for its URL, is
        appended beneath it.  The loop module gets an exception event with 5
        retries.

        :param module_super: parent module that receives the loop module.
        """
        detail_loop = Module(None, u"进入股东详情",
                             Iterator("gdxx_list", "gdxx_rcd"))
        module_super.appendSubModule(detail_loop, True)

        def detail_headers(ua):
            # Only the User-Agent varies; it is taken from the ua argument.
            return {
                'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Connection': 'keep-alive',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'User-Agent': ua,
                'Host': 'gsxt.jxaic.gov.cn',
                'Cache-Control': 'max-age=0'
            }

        detail_fetcher = Module(self.visitGdxq, u"获取股东详情信息")
        detail_fetcher.appendUrl(self.getGdxqUrl)
        detail_fetcher.appendHeaders(detail_headers)

        retry_event = Event(EventType.EXCEPTION_OCCURED, retry_times=5)
        detail_loop.addEvent(retry_event)
        detail_loop.appendSubModule(detail_fetcher, True)
예제 #22
0
    def fetchCmpnyGdxq(self, module_super):
        """Attach the shareholder-detail loop and its detail fetcher.

        A loop module built from ``Iterator(seeds="gdxq_list",
        param_name="gdxq")`` is appended to ``module_super``; a sub-module
        bound to ``self.visitGdxq`` is appended beneath it with a
        ``Sleep(3)`` and an exception event with 100 retries.

        :param module_super: parent module that receives the loop module.
        """
        detail_loop = Module(
            iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
            name=u"遍历股东详情")
        module_super.appendSubModule(detail_loop)

        detail_fetcher = Module(self.visitGdxq, u"抓取股东详情")

        # TODO: add try/except handling
        def getURL(gdxq):
            # Return the text between "('" and "')" of the first match.
            if not gdxq:
                return None
            found = re.search(r"(?<=\(').+?(?='\))", gdxq)
            return found.group(0) if found else None

        detail_fetcher.appendUrl(getURL)

        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }
        detail_fetcher.appendHeaders(request_headers)
        detail_fetcher.appendEncoding("utf-8")
        detail_fetcher.addSleep(Sleep(3))

        retry_event = Event(EventType.EXCEPTION_OCCURED, retry_times=100)
        detail_fetcher.addEvent(retry_event)

        detail_loop.appendSubModule(detail_fetcher)
예제 #23
0
    def fetchStockholderDetail(self, module_super):
        """Walk the shareholder detail list and fetch each detail page.

        A loop module built from ``Iterator(seeds="gdxq_list",
        param_name="gdxq")`` is appended to ``module_super``; a sub-module
        bound to ``self.visitGdxq`` is appended beneath it with a
        ``Sleep(3)`` and an exception event with 100 retries.

        :param module_super: parent module that receives the loop module.
        :return: None
        """
        detail_loop = Module(
            iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
            name=u"遍历股东详情")
        module_super.appendSubModule(detail_loop)

        detail_fetcher = Module(self.visitGdxq, u"抓取股东详情")

        def getURL(gdxq):
            # Prefix the text between "('" and "')" with the site root.
            if not gdxq:
                return None
            found = re.search(r"(?<=\(').+?(?='\))", gdxq)
            if found:
                return u"http://gsxt.hljaic.gov.cn" + found.group(0)
            return None

        detail_fetcher.appendUrl(getURL)

        request_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
        }
        detail_fetcher.appendHeaders(request_headers)
        detail_fetcher.appendEncoding("utf-8")
        detail_fetcher.addSleep(Sleep(3))

        retry_event = Event(EventType.EXCEPTION_OCCURED, retry_times=100)
        detail_fetcher.addEvent(retry_event)

        detail_loop.appendSubModule(detail_fetcher)
예제 #24
0
    def initBranchInfoPage(self, module_super):
        """Attach the branch-office paging loop and its page fetcher.

        A loop module (id ``fzjg_pages``) built from
        ``Iterator("fzjg_page_range", "pno")`` is appended to
        ``module_super``; a sub-module bound to ``self.visitFzjg`` is
        appended beneath it.  The loop module gets a ``Sleep(2)``.

        :param module_super: parent module that receives the loop module.
        """
        page_loop = Module(None, u"第八步_获取分支机构_开始翻页数据",
                           Iterator("fzjg_page_range", "pno"))
        page_loop.module_id = "fzjg_pages"
        module_super.appendSubModule(page_loop, True)

        # NOTE(review): the parameter is named ``id`` in the original;
        # presumably the framework matches arguments by name - keep as is.
        def page_url(id, pno):
            return "http://aic.hainan.gov.cn:1888/QueryChildList.jspx?mainId=%s&pno=%s" %(id, pno)

        def page_headers(company_url):
            # company_url is appended to the site root to form the Referer.
            return {
                'Host': "aic.hainan.gov.cn:1888",
                'Connection': 'keep-alive',
                'Accept': '*/*',
                'Referer': "http://aic.hainan.gov.cn:1888" + company_url,
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8'
            }

        page_fetcher = Module(self.visitFzjg, u"第八步_获取分支机构翻页数据")
        page_fetcher.appendUrl(page_url)
        page_fetcher.appendHeaders(page_headers)

        page_loop.addSleep(Sleep(2))
        page_loop.appendSubModule(page_fetcher)
예제 #25
0
    def fetchCompanyInfo(self):
        """Create the company-list loop branch and register its stages.

        The loop module is built from ``Iterator("search_list", "com")``,
        gets ``self.yzmSave`` as an extra function and is appended to
        ``self.module_manager``.  It is then passed, in order, to each method
        called below.

        :return: None
        """
        company_loop = Module(None, u"处理公司列表",
                              Iterator("search_list", "com"))
        # Save the captcha image.
        company_loop.appendExtraFunction(self.yzmSave)
        self.module_manager.appendSubModule(company_loop, True)

        # Stage order is significant: keep it as written.
        self.prepareCompnanyParms(company_loop)
        self.getCompanyInfo(company_loop)
        self.fetchStockholderDetail(company_loop)
        self.fetchStockholderInfo(company_loop)
        self.fetchChangeInfo(company_loop)
        self.fetchRecordInfo(company_loop)
        self.fetchBranchInfo(company_loop)
        self.fetchPunishInfo(company_loop)
        self.getAnnalsList(company_loop)
        self.getAnnalsInfo(company_loop)
        self.initResultCollect(company_loop)
예제 #26
0
    def initShareholderInfoPage(self, module_super):
        """Attach the shareholder-info paging loop and its page fetcher.

        A loop module (id ``gdxx_pages``) built from
        ``Iterator("gdxx_page_range", "pno")`` is appended to
        ``module_super``; a sub-module bound to ``self.visitGdxx`` is
        appended beneath it.  The loop module gets a ``Sleep(2)``.

        :param module_super: parent module that receives the loop module.
        """
        page_loop = Module(None, u"第五步_获取股东信息_开始翻页数据",
                           Iterator("gdxx_page_range", "pno"))
        page_loop.module_id = "gdxx_pages"
        module_super.appendSubModule(page_loop, True)

        # NOTE(review): the parameter is named ``id`` in the original;
        # presumably the framework matches arguments by name - keep as is.
        def page_url(id, pno):
            return "http://aic.hainan.gov.cn:1888/QueryInvList.jspx?mainId=%s&pno=%s" % (id, pno)

        def page_headers(company_url):
            # company_url is appended to the site root to form the Referer.
            return {
                'Host': "aic.hainan.gov.cn:1888",
                'Connection': 'keep-alive',
                'Cache-Control': 'max-age=0',
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Referer': "http://aic.hainan.gov.cn:1888" + company_url,
                'Accept-Encoding': 'gzip, deflate',
                "Accept-Language": "zh-CN,zh;q=0.8"
            }

        page_fetcher = Module(self.visitGdxx, u"第五步_获取股东信息翻页数据")
        page_fetcher.appendUrl(page_url)
        page_fetcher.appendHeaders(page_headers)

        page_loop.addSleep(Sleep(2))
        page_loop.appendSubModule(page_fetcher)
예제 #27
0
    def fetchPunishInfo(self, module_super):
        """Walk the administrative-penalty pages and fetch each one.

        A loop module built from ``Iterator(seeds="xzcf_page_range",
        param_name="pno")`` is appended to ``module_super``; a sub-module
        bound to ``self.visitXzcf`` is appended beneath it with a
        ``Sleep(3)`` and an exception event with 50 retries.

        :param module_super: parent module that receives the loop module.
        :return: None
        """
        page_loop = Module(
            iterator=Iterator(seeds="xzcf_page_range", param_name="pno"),
            name=u"遍历行政处罚翻页")
        module_super.appendSubModule(page_loop)

        def page_url(pno, company_id):
            # A fresh random value is put in the ``ran`` query parameter.
            return (
                "http://218.95.241.36/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s" %
                (pno, company_id, str(random.random())))

        def page_headers(company_id):
            # The company's publicity page is sent as the Referer.
            return {
                'Host': '218.95.241.36',
                'Proxy-Connection': 'keep-alive',
                'Accept': '*/*',
                'Referer':
                'http://218.95.241.36/businessPublicity.jspx?id=' + str(
                    company_id),
                'User-Agent':
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
                'Accept-Encoding': 'gzip, deflate, sdch',
                "Accept-Language": "zh-CN,zh;q=0.8"
            }

        page_fetcher = Module(self.visitXzcf, u"抓取行政处罚信息")
        page_fetcher.appendUrl(page_url)
        page_fetcher.appendHeaders(page_headers)
        page_fetcher.appendEncoding("utf-8")
        page_fetcher.addSleep(Sleep(3))

        retry_event = Event(EventType.EXCEPTION_OCCURED, retry_times=50)
        page_fetcher.addEvent(retry_event)

        page_loop.appendSubModule(page_fetcher)
예제 #28
0
    def fetchStockholderDetail(self, module_super):
        """Walk the shareholder detail list and fetch each detail page.

        A loop module built from ``Iterator(seeds="gdxq_list",
        param_name="gdxq")`` is appended to ``module_super``; a sub-module
        bound to ``self.visitGdxq`` is appended beneath it with a
        ``Sleep(3)`` and an exception event with 50 retries.

        :param module_super: parent module that receives the loop module.
        :return: None
        """
        detail_loop = Module(
            iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
            name=u"遍历股东详情")
        module_super.appendSubModule(detail_loop)

        detail_fetcher = Module(self.visitGdxq, u"抓取股东详情")

        def getURL(gdxq):
            # Prefix the text between "('" and "')" with the site root.
            if not gdxq:
                return None
            found = re.search(r"(?<=\(').+?(?='\))", gdxq)
            if found:
                return "http://218.95.241.36" + found.group(0)
            return None

        detail_fetcher.appendUrl(getURL)

        request_headers = {
            "Host": "218.95.241.36",
            "Proxy-Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer": "http://218.95.241.36/searchList.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }
        detail_fetcher.appendHeaders(request_headers)
        detail_fetcher.appendEncoding("utf-8")
        detail_fetcher.addSleep(Sleep(3))

        retry_event = Event(EventType.EXCEPTION_OCCURED, retry_times=50)
        detail_fetcher.addEvent(retry_event)

        detail_loop.appendSubModule(detail_fetcher)
예제 #29
0
    def fetchPunishInfo(self, module_super):
        """Walk the administrative-penalty pages and fetch each one.

        A loop module built from ``Iterator(seeds="xzcf_page_range",
        param_name="pno")`` is appended to ``module_super``; a sub-module
        bound to ``self.visitXzcf`` is appended beneath it with a
        ``Sleep(3)`` and an exception event with 50 retries.

        :param module_super: parent module that receives the loop module.
        :return: None
        """
        page_loop = Module(
            iterator=Iterator(seeds="xzcf_page_range", param_name="pno"),
            name=u"遍历行政处罚翻页")
        module_super.appendSubModule(page_loop)

        def page_url(pno, company_id):
            # A fresh random value is put in the ``ran`` query parameter.
            return (
                "http://gsxt.hljaic.gov.cn/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s"
                % (pno, company_id, str(random.random())))

        def page_headers(company_id):
            # The company's publicity page is sent as the Referer.
            return {
                "Host": "gsxt.hljaic.gov.cn",
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                'Referer':
                'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' + str(
                    company_id),
            }

        page_fetcher = Module(self.visitXzcf, u"抓取行政处罚信息")
        page_fetcher.appendUrl(page_url)
        page_fetcher.appendHeaders(page_headers)
        page_fetcher.appendEncoding("utf-8")
        page_fetcher.addSleep(Sleep(3))

        retry_event = Event(EventType.EXCEPTION_OCCURED, retry_times=50)
        page_fetcher.addEvent(retry_event)

        page_loop.appendSubModule(page_fetcher)
예제 #30
0
    def initCompanyInfo(self):
        """Create the per-company loop module and register its stages.

        The loop module is built from ``Iterator("search_list", "com")`` and
        appended to ``self.module_manager``.  It is then passed, in order, to
        each ``init*`` method called below.
        """
        company_loop = Module(None, u"获取公司信息",
                              Iterator("search_list", "com"))
        self.module_manager.appendSubModule(company_loop, True)

        # URL parameters and basic info.
        self.initUrlParams(company_loop)
        self.initBasicInfo(company_loop)

        # Shareholder info: paging, then detail.
        self.initShareholderInfoPage(company_loop)
        self.initShareholderInfoDetail(company_loop)

        # Change, archive and branch info paging.
        # NOTE: initArchiveInfoTwoPage is not wired in (disabled in the
        # original code).
        self.initChangeInfoPage(company_loop)
        self.initArchiveInfoPage(company_loop)
        self.initBranchInfoPage(company_loop)

        # Annual reports: list, then each report.
        # NOTE: initAnnualReportInfo is not wired in (disabled in the
        # original code).
        self.initAnnualReportList(company_loop)
        self.initAnnualReport(company_loop)

        self.initResultCollect(company_loop)