コード例 #1
0
ファイル: CrawlerHainan.py プロジェクト: chybot/crawler
    def initBasicInfo(self, module_super):
        """Build the "basic info" module and attach it to ``module_super``.

        The module is created around ``self.visitJbxx`` and configured with a
        query-string input function, a URL template, request headers and a
        sleep.  After it has been appended to ``module_super``, four optional
        outputs are registered, one per paginated section of the page
        (shareholders, changes, filings, branches).  Each output is the list
        of page numbers beyond the first (2..N), and each is paired with a
        bypass whose condition is true when that list is empty.

        :param module_super: parent module; only ``appendSubModule`` is
            called on it here.
        """
        # Module name literally reads "step 4: fetch basic info".
        module = Module(self.visitJbxx, u"第四步_获取基本信息")

        def getparams(company_url):
            """Return the query-string parameters of ``company_url`` as a dict.

            Values are kept exactly as they appear in the URL (no
            percent-decoding).  Empty segments are skipped, a parameter
            without ``=`` maps to an empty string, and only the first ``=``
            separates key from value, so values may themselves contain ``=``.
            """
            query = {}
            for pair in urlparse.urlparse(company_url).query.split("&"):
                if not pair:
                    # An empty query string (or a stray "&") yields "".
                    continue
                key, _, value = pair.partition("=")
                query[key] = value
            return query
        module.appendInput(InputType.FUNCTION, getparams)
        # NOTE(review): the parameter is named ``id`` (shadowing the builtin),
        # presumably because the framework binds it by name to the "id" key
        # produced by getparams -- confirm before renaming.
        module.appendUrl(lambda id: "http://aic.hainan.gov.cn:1888/businessPublicity.jspx?id=%s" % id)

        module.appendHeaders(
            {
                "Host": "aic.hainan.gov.cn:1888",
                "Connection": "keep-alive",
                "Cache-Control": "max-age=0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
                "Referer": "http://aic.hainan.gov.cn:1888/searchList.jspx",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.8"
            }
        )
        module.addSleep(Sleep(2))

        module_super.appendSubModule(module, True)

        def count_pages(html, fenye_xpath):
            """Return the page count read from the first pager node, or 0.

            ``fenye_xpath`` selects candidate pager nodes in ``html``; the
            first match is handed to ``self.parse_pageno``.  The return type
            of ``parse_pageno`` is not visible from here, so a falsy result
            is treated as "no pager" and anything else is coerced with
            ``int`` before callers compare or do arithmetic on it.
            """
            pager_nodes = etree.HTML(html).xpath(fenye_xpath)
            if not pager_nodes:
                return 0
            pageno = self.parse_pageno(pager_nodes[0])
            if not pageno:
                return 0
            return int(pageno)

        # Shareholder info: page numbers still to fetch.
        def getGdxxPageno(html):
            fenye_xpath = ".//div[@id='invDiv']/following-sibling::table[1]|.//div[@id='invDiv']/following-sibling::div[1]"
            pageno = count_pages(html, fenye_xpath)
            if pageno <= 1:
                return []
            self.holder.logging.info("------------------股东信息页码:" + str(pageno) + "---------------------")
            return range(2, pageno + 1)

        module.appendOutput(name="gdxx_page_range", type=OutputType.FUNCTION, function=getGdxxPageno, show_up=OutputParameterShowUpType.OPTIONAL)

        def bypass_fun_gdxx(gdxx_page_range=None):
            # True when there are no further shareholder pages.
            return not gdxx_page_range
        module.appendBypass(Bypass(condition_fuc=bypass_fun_gdxx, module_id="gdxx_pages"))

        # Change info: page numbers still to fetch.
        def getBgxxPageno(html):
            fenye_xpath = ".//div[@id='altPagination']/table[1]|.//div[@id='altDiv']/following-sibling::table[1]"
            pageno = count_pages(html, fenye_xpath)
            if pageno <= 1:
                return []
            return range(2, pageno + 1)
        module.appendOutput(name="bgxx_page_range", type=OutputType.FUNCTION, function=getBgxxPageno, show_up=OutputParameterShowUpType.OPTIONAL)

        def bypass_fun_bgxx(bgxx_page_range=None):
            # True when there are no further change-info pages.
            return not bgxx_page_range
        module.appendBypass(Bypass(condition_fuc=bypass_fun_bgxx, module_id="bgxx_pages"))

        # Filing (record) info: page numbers still to fetch.
        def getbaxxPageno(html):
            fenye_xpath = ".//div[@id='memDiv']/following-sibling::table[1]|.//*[@id='beian']/table[2]"
            pageno = count_pages(html, fenye_xpath)
            if pageno <= 1:
                return []
            return range(2, pageno + 1)
        module.appendOutput(name="baxx_page_range", type=OutputType.FUNCTION, function=getbaxxPageno, show_up=OutputParameterShowUpType.OPTIONAL)

        def bypass_fun_baxx(baxx_page_range=None):
            # True when there are no further filing-info pages.
            return not baxx_page_range
        module.appendBypass(Bypass(condition_fuc=bypass_fun_baxx, module_id="baxx_pages"))

        # NOTE: an alternative bypass for "baxx_pages", keyed on the absence
        # of div#memDiv in the page (i.e. no filing data at all), used to be
        # sketched here as commented-out code; it was never enabled.

        # Branch offices: page numbers still to fetch.
        def getfzjgPageno(html):
            fenye_xpath = ".//div[@id='childPagination']/table[1]|.//div[@id='childDiv']/following-sibling::table[1]"
            pageno = count_pages(html, fenye_xpath)
            if pageno <= 1:
                return []
            return range(2, pageno + 1)
        module.appendOutput(name="fzjg_page_range", type=OutputType.FUNCTION, function=getfzjgPageno, show_up=OutputParameterShowUpType.OPTIONAL)

        def bypass_fun_fzjg(fzjg_page_range=None):
            # True when there are no further branch-office pages.
            return not fzjg_page_range
        module.appendBypass(Bypass(condition_fuc=bypass_fun_fzjg, module_id="fzjg_pages"))
コード例 #2
0
    def getCompanyInfo(self, module_super):
        """Assemble the company-info module and append it to ``module_super``.

        The module wraps ``self.visitJbxx`` under the id ``get_cmpny_info``.
        It is given a URL resolver, request headers, an encoding, a sleep,
        two optional list outputs and two retry events.  Once appended to
        ``module_super`` it also receives two bypasses, both conditioned on
        ``params_list`` being empty.

        :param module_super: parent module; only ``appendSubModule`` is
            called on it here.
        """
        # Module name literally reads "scrape company info".
        module = Module(self.visitJbxx, u"抓取公司信息")
        module.module_id = "get_cmpny_info"

        def getURL(company_url):
            """Return ``company_url`` as-is if it contains "http", else
            strip every ".." from it and prefix the site base path."""
            if "http" not in company_url:
                relative_part = company_url.replace('..', '')
                return u"http://www.nmgs.gov.cn:7001/aiccips" + relative_part
            return company_url

        module.appendUrl(getURL)

        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        }
        module.appendHeaders(request_headers)
        module.appendEncoding("utf-8")
        module.addSleep(Sleep(3))

        # Optional list outputs, registered in this order: (name, xpath).
        list_outputs = (
            ("gdxq_list", ".//*[@id='invInfo']/table/tr/td/a/@onclick"),
            ("params_list", ".//input[@type='hidden']/@value"),
        )
        for output_name, output_xpath in list_outputs:
            module.appendOutput(name=output_name,
                                xpath=output_xpath,
                                type=OutputType.LIST,
                                show_up=OutputParameterShowUpType.OPTIONAL)

        # Both event types are registered with retry_times=100.
        retry_triggers = (EventType.EXCEPTION_OCCURED,
                          EventType.OUTPUT_NOT_SATISFIED)
        for trigger in retry_triggers:
            module.addEvent(Event(trigger, retry_times=100))

        module_super.appendSubModule(module, True)

        def checkParmsList(params_list=None):
            # True when params_list is missing or empty.
            return not params_list

        # The same condition guards both target modules.
        for target_id in ("get_record_info", "get_punish_info"):
            module.appendBypass(
                Bypass(condition_fuc=checkParmsList,
                       module_id=target_id,
                       range_global=True))