コード例 #1
0
    def initConfigBaseInfo(self, module_super):
        """Configure the "basic information" module and attach it to ``module_super``.

        Builds a ``Module`` around ``self.visitJbxx`` that POSTs
        ``method=qyInfo`` to ``ztxy.do``, registers its outputs, events and
        sleep in a fixed order, and finally calls
        ``module_super.appendSubModule(module, True)``.

        :param module_super: object whose ``appendSubModule`` receives the
            configured module.
        """
        # --- request definition -------------------------------------------
        build_form = lambda pripid, entbigtype: {
            'djjg': '',
            'maent.entbigtype': entbigtype,
            'maent.pripid': pripid,
            'method': 'qyInfo',
            'random': str(int(time.time() * 1000))
        }
        build_headers = lambda ua: {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
            "Connection": "keep-alive"
        }

        # --- callbacks handed to the module -------------------------------
        def setCompanyZch(company_zch_list=None):
            # Store the first extracted registration number, or None when
            # the list is empty / missing.
            first_zch = company_zch_list[0] if company_zch_list else None
            self.value_dict['company_zch'] = first_zch

        def assertNameZch(company_name=None, company_zch=None):
            # Assert on the company name and registration number: both must
            # be present and have a length strictly between 0 and 100.
            if not company_name or not company_zch:
                return False
            if not 0 < len(company_name) < 100:
                return False
            if not 0 < len(company_zch) < 100:
                return False
            # self.report.access_type = SeedAccessType.OK
            return True

        def getXhPripid(html):
            # Extract the shareholder-detail parameters from the basic-info
            # page; they form the POST parameters of the shareholder-detail
            # request.
            onclick_re = re.compile(
                r'\s+onclick="showRyxx\(\'(.+?)\'\,\'(.+?)\'\)"', re.S)
            return onclick_re.findall(html)

        # --- module wiring (call order is preserved) ----------------------
        base_info = Module(self.visitJbxx, u"基本信息")
        base_info.appendPostData(build_form)
        base_info.appendWebMethod("post")
        base_info.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
        base_info.appendHeaders(build_headers)
        base_info.appendOutput("company_zch_list",
                               '//table[1]/tr[2]/td[1]/text()',
                               OutputType.LIST)
        base_info.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5))
        base_info.appendOutput(type=OutputType.FUNCTION,
                               function=setCompanyZch)
        base_info.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=5,
                  assert_function=assertNameZch))
        base_info.appendOutput(name='xh_pripid',
                               type=OutputType.FUNCTION,
                               function=getXhPripid,
                               show_up=OutputParameterShowUpType.OPTIONAL)
        base_info.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
        base_info.addSleep(Sleep(2))
        module_super.appendSubModule(base_info, True)
コード例 #2
0
    def checkCompanyName(self):
        """Configure the company-name check module on ``self.module_manager``.

        The module wraps ``self.getWebHtml`` and POSTs ``qymc=<company_key>``
        to ``keyword.do?method=keywordFilter``.  Its assertion passes only
        when the stripped response body equals ``'1'``.
        """
        # The timestamp is taken once, while the module is being configured.
        filter_url = (
            "http://gsxt.scaic.gov.cn/keyword.do?method=keywordFilter&random="
            + str(int(time.time())))

        build_headers = lambda ua: {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
            "Connection": "keep-alive"
        }

        def assertRecode(html):
            # Side effect: a NON_COMPANY access type is rewritten to ERROR
            # before the response body is examined.
            if self.report.access_type == SeedAccessType.NON_COMPANY:
                self.report.access_type = SeedAccessType.ERROR
            if not html:
                return False
            return html.strip() == '1'

        name_check = Module(self.getWebHtml, u'验证公司名称')
        name_check.appendUrl(filter_url)
        name_check.appendHeaders(build_headers)
        name_check.appendPostData(lambda company_key: {'qymc': company_key})
        name_check.appendWebMethod('post')
        name_check.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=assertRecode,
                  redo_module="check_validatecode"))
        name_check.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=100))
        name_check.addSleep(Sleep(3))
        self.module_manager.appendSubModule(name_check, True)
コード例 #3
0
ファイル: CrawlerHainan.py プロジェクト: chybot/crawler
    def initSubmitYzm(self):
        """Configure step two: submit the captcha for verification.

        The module wraps ``self.getJson`` and POSTs ``checkNo=<yzm>`` to
        ``checkCheckNo.jspx``.  Both registered events name ``hn_yzm_pic``
        as their ``redo_module`` and use ``retry_times=0``.
        """
        request_headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Host": "aic.hainan.gov.cn:1888",
            "Origin": "http://aic.hainan.gov.cn:1888",
            "Proxy-Connection": "keep-alive",
            "Referer": "http://aic.hainan.gov.cn:1888/search.jspx",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }

        def submitYzmResult(json=None):
            # Captcha validation condition: the response must be non-empty
            # and contain the literal '{success:true}'.
            if json and '{success:true}' in json:
                return True
            return False

        submit = Module(self.getJson, u"第二步_提交验证码验证")
        submit.appendUrl("http://aic.hainan.gov.cn:1888/checkCheckNo.jspx")
        submit.appendHeaders(request_headers)
        submit.appendWebMethod("post")
        submit.appendPostData(lambda yzm: {"checkNo": yzm})
        submit.addSleep(Sleep(3))
        submit.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=0,
                  redo_module="hn_yzm_pic"))
        submit.addEvent(
            Event(EventType.ASSERT_FAILED,
                  assert_function=submitYzmResult,
                  retry_times=0,
                  redo_module="hn_yzm_pic"))
        self.module_manager.appendSubModule(submit)
コード例 #4
0
ファイル: CrawlerZhejiang.py プロジェクト: chybot/crawler
    def initChangeInfoPage(self, module_super):
        """Configure paging over the change-information ("bgxx") records.

        An iterator module built from ``Iterator("bgxx_pages", "page_no")``
        is attached to ``module_super``; a child module wrapping
        ``self.visitBgxx`` is attached to it and POSTs ``page``/``limit``/
        ``mark`` to the ``gsgs_viewDjxxBgxx.pt`` endpoint for ``qyid``.

        :param module_super: object whose ``appendSubModule`` receives the
            iterator module.
        """
        pager = Module(None, "进入变更信息翻页",
                       Iterator("bgxx_pages", "page_no"))
        module_super.appendSubModule(pager)

        page_url = lambda qyid: (
            "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s"
            % qyid)
        page_form = lambda page_no: {'page': page_no, 'limit': 5, 'mark': 0}
        page_headers = {
            'Host': 'gsxt.jxaic.gov.cn',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

        page_fetch = Module(self.visitBgxx, "获取变更翻页信息")
        page_fetch.appendUrl(page_url)
        page_fetch.appendWebMethod("post")
        page_fetch.appendPostData(page_form)
        page_fetch.appendHeaders(page_headers)
        pager.appendSubModule(page_fetch)
コード例 #5
0
 def initPenaltyInfo(self, module_super):
     """Configure the administrative-penalty module and attach it to ``module_super``.

     The module wraps ``self.visitXzcf`` and POSTs ``method=cfInfo`` to
     ``ztxy.do``; an EXCEPTION_OCCURED event (``retry_times=5``) and a
     ``Sleep(2)`` are registered on it.

     :param module_super: object whose ``appendSubModule`` receives the
         configured module.
     """
     build_headers = lambda ua: {
         "User-Agent": ua,
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
         "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
         "Accept-Encoding": "gzip, deflate",
         "Host": "gsxt.scaic.gov.cn",
         "Origin": "http://gsxt.scaic.gov.cn",
         "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
         "Connection": "keep-alive"
     }
     # 'random' is a millisecond timestamp computed each time the form is built.
     build_form = lambda pripid, entbigtype: {
         'czmk': 'czmk3',
         'maent.entbigtype': entbigtype,
         'maent.pripid': pripid,
         'method': 'cfInfo',
         'random': str(int(time.time() * 1000))
     }

     penalty = Module(self.visitXzcf, u"获取行政处罚信息")
     penalty.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
     penalty.appendHeaders(build_headers)
     penalty.appendWebMethod("post")
     penalty.appendPostData(build_form)
     penalty.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
     penalty.addSleep(Sleep(2))
     module_super.appendSubModule(penalty, True)
コード例 #6
0
ファイル: CrawlerShanghai.py プロジェクト: chybot/crawler
 def initSearchPage(self, module_super):
     """Configure the paged search-list module and attach it to ``module_super``.

     The module wraps ``self.visitSearchList`` and POSTs the session token,
     keyword and page number to ``ent_info_list``.  The ``search_list``
     output is taken from ``div.list-item`` nodes.

     :param module_super: object whose ``appendSubModule`` receives the
         configured module.
     """
     request_headers = {
         'Host': 'www.sgs.gov.cn',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
         'Accept-Encoding': 'gzip, deflate',
         'Referer': 'https://www.sgs.gov.cn/notice/home',
         'Content-Type': 'application/x-www-form-urlencoded'
     }
     build_form = lambda token, company_key, page_no: {
         'searchType': '1',
         'captcha': 0,
         'session.token': token,
         'condition.keyword': company_key,
         'condition.pageNo': page_no
     }

     search_page = Module(self.visitSearchList, u"搜索列表-翻页")
     search_page.appendUrl(
         'https://www.sgs.gov.cn/notice/search/ent_info_list')
     search_page.appendHeaders(request_headers)
     search_page.appendWebMethod("post")
     search_page.appendPostData(build_form)
     search_page.appendOutput("search_list",
                              './/div[@class="list-item"]',
                              OutputType.LIST)
     search_page.addEvent(
         Event(EventType.EXCEPTION_OCCURED, retry_times=2))
     module_super.appendSubModule(search_page)
コード例 #7
0
ファイル: CrawlerHainan.py プロジェクト: chybot/crawler
 def initSearchList(self):
     """Configure step three: search for the company list.

     The module wraps ``self.visitSearchList`` and POSTs the captcha and
     company keyword to ``searchList.jspx``.  ``url_list`` and
     ``name_list`` are extracted by XPath and zipped into ``search_list``.
     The assertion fails when ``self.report.access_type`` equals
     ``SeedAccessType.NON_COMPANY``.
     """
     request_headers = {
         "Host": "aic.hainan.gov.cn:1888",
         "Connection": "keep-alive",
         "Cache-Control": "max-age=0",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
         "Origin": "http://aic.hainan.gov.cn:1888",
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
         "Content-Type": "application/x-www-form-urlencoded",
         "Referer": "http://aic.hainan.gov.cn:1888/search.jspx",
         "Accept-Encoding": "gzip, deflate",
         "Accept-Language": "zh-CN,zh;q=0.8"
     }
     build_form = lambda yzm, company_key: {
         "checkNo": yzm,
         "entName": company_key
     }
     pair_up = lambda url_list, name_list: zip(url_list, name_list)
     is_company = lambda: not (
         self.report.access_type == SeedAccessType.NON_COMPANY)

     search = Module(self.visitSearchList, u"第三步_开始搜索公司列表")
     search.appendUrl("http://aic.hainan.gov.cn:1888/searchList.jspx")
     search.appendHeaders(request_headers)
     search.appendWebMethod("post")
     search.appendPostData(build_form)
     search.appendOutput("url_list", ".//div[@class='list']//a/@href",
                         OutputType.LIST)
     search.appendOutput("name_list", ".//div[@class='list']//a/text()",
                         OutputType.LIST)
     search.appendOutput(name="search_list",
                         type=OutputType.FUNCTION,
                         function=pair_up)
     search.addEvent(
         Event(EventType.EXCEPTION_OCCURED,
               retry_times=20,
               redo_module="hn_yzm_pic"))
     search.addEvent(
         Event(EventType.OUTPUT_NOT_SATISFIED,
               retry_times=20,
               redo_module="hn_yzm_pic"))
     search.addEvent(
         Event(EventType.ASSERT_FAILED,
               retry_times=0,
               assert_function=is_company))
     search.appendMiddleValueMonitor("search_list")
     search.addSleep(Sleep(1))
     self.module_manager.appendSubModule(search)
コード例 #8
0
ファイル: CrawlerJiangxi.py プロジェクト: chybot/crawler
 def initChangeInfoPage(self, module_super):
     """Configure paging over the change-information ("bgxx") records.

     An iterator module built from ``Iterator("bgxx_pages", "page_no")``
     is attached to ``module_super``; a child module wrapping
     ``self.visitBgxx`` is attached to it with ``appendSubModule(..., True)``
     and POSTs ``page``/``limit``/``mark`` for the given ``qyid``.

     :param module_super: object whose ``appendSubModule`` receives the
         iterator module.
     """
     pager = Module(None, u"进入变更信息翻页",
                    Iterator("bgxx_pages", "page_no"))
     module_super.appendSubModule(pager)

     page_url = lambda qyid: (
         "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s"
         % qyid)
     page_form = lambda page_no: {'page': page_no, 'limit': 5, 'mark': 0}
     page_headers = lambda ua: {
         'Host': 'gsxt.jxaic.gov.cn',
         'Connection': 'keep-alive',
         'User-Agent': ua,
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
     }

     page_fetch = Module(self.visitBgxx, u"获取变更翻页信息")
     page_fetch.appendUrl(page_url)
     page_fetch.appendWebMethod("post")
     page_fetch.appendPostData(page_form)
     page_fetch.appendHeaders(page_headers)
     pager.appendSubModule(page_fetch, True)
コード例 #9
0
 def initShareHolderDetail(self, module_super):
     """Configure iteration over shareholder-detail requests.

     An iterator module (``Iterator("xh_pripid", "xh_prid")``, module id
     ``fetch_gdxq_info``) is attached to ``module_super``.  A child module
     wrapping ``self.visitGdxq`` POSTs ``method=tzrCzxxDetial`` to
     ``ztxy.do`` once per iterated ``xh_prid`` item, using ``xh_prid[0]``
     as ``maent.entbigtype`` and ``xh_prid[1]`` as ``maent.pripid``.

     :param module_super: object whose ``appendSubModule`` receives the
         iterator module.
     """
     iterator = Iterator("xh_pripid", "xh_prid")
     module = Module(None, "进入股东详情", iterator)
     module.module_id = "fetch_gdxq_info"
     module_super.appendSubModule(module, True)
     sub_module = Module(self.visitGdxq, u"获取股东翻页信息")
     sub_module.appendUrl('http://gsxt.scaic.gov.cn/ztxy.do')
     sub_module.appendHeaders(
         lambda ua: {
             "User-Agent": ua,
             "Accept":
             "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Host": "gsxt.scaic.gov.cn",
             "Origin": "http://gsxt.scaic.gov.cn",
             "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
             "Connection": "keep-alive"
         })
     sub_module.appendWebMethod("post")
     # 'random' appears exactly once: a dict literal keeps only the last
     # value of a repeated key, so listing it twice was redundant.
     sub_module.appendPostData(
         lambda xh_prid: {
             'maent.pripid': xh_prid[1],
             'maent.entbigtype': xh_prid[0],
             'method': 'tzrCzxxDetial',
             'random': str(int(time.time() * 1000))
         })
     # NOTE(review): the event and sleep are registered on the iterator
     # module rather than on sub_module -- confirm this is intended.
     module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
     module.addSleep(Sleep(2))
     module.appendSubModule(sub_module, True)
コード例 #10
0
    def getCmpnySereachList(self):
        """Configure the company search-list module (id ``get_search_list``).

        The module wraps ``self.visitSearchList`` and POSTs the captcha and
        ``textfield`` to ``showInfo.html``.  ``url_list`` and ``name_list``
        are extracted by XPath and zipped into ``search_list``.
        """
        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }
        # The raw string r"\n" is the two characters backslash + 'n', so
        # this strips that literal sequence rather than real newlines.
        build_form = lambda yzm, textfield: {
            "code": yzm,
            "textfield": textfield.replace(r"\n", "")
        }
        pair_up = lambda url_list, name_list: zip(url_list, name_list)
        is_company = lambda: not (
            self.report.access_type == SeedAccessType.NON_COMPANY)

        search = Module(self.visitSearchList, u"抓取公司列表")
        search.module_id = "get_search_list"
        search.appendUrl(
            "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/showInfo.html"
        )
        search.appendHeaders(request_headers)
        search.appendWebMethod("post")
        search.appendEncoding("utf-8")
        search.appendPostData(build_form)
        search.addSleep(Sleep(3))

        search.appendOutput("url_list",
                            ".//*[@class='list']/ul/li/a/@href",
                            OutputType.LIST)
        search.appendOutput("name_list",
                            ".//*[@class='list']/ul/li/a/text()",
                            OutputType.LIST)
        search.appendOutput(name="search_list",
                            type=OutputType.FUNCTION,
                            function=pair_up)

        search.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=0,
                  assert_function=is_company))
        for event_type in (EventType.EXCEPTION_OCCURED,
                           EventType.OUTPUT_NOT_SATISFIED):
            search.addEvent(
                Event(event_type,
                      retry_times=100,
                      redo_module="init_validate_code"))

        self.module_manager.appendSubModule(search)
コード例 #11
0
    def checkValidateCode(self):
        """Configure the captcha-check module (id ``check_validate_code``).

        The module wraps ``self.getJson`` and POSTs the captcha and company
        keyword to ``checkCode.html``.  Its assertion parses the quoted
        tokens of the response body and, on success, stores the decoded
        fourth token in ``self.value_dict["textfield"]``.
        """
        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'X-Requested-With': 'XMLHttpRequest',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

        def checkValidatecode(web=None):
            if not web:
                return False
            # Every double-quoted token in the body, in order of appearance.
            flags = re.findall(r'\"([\s\S]*?)\"', str(web.body))
            malformed = len(flags) != 4 or flags[2] != 'textfield'
            if malformed or (flags[0] == 'flag' and flags[1] != str(1)):
                self.holder.logging.warning(u"验证码校验失败!")
                return False
            self.value_dict["textfield"] = flags[3].decode(
                'raw_unicode_escape')
            return True

        checker = Module(self.getJson, u"检验验证码")
        checker.module_id = "check_validate_code"
        checker.appendUrl(
            "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/checkCode.html"
        )
        checker.appendHeaders(request_headers)
        checker.appendWebMethod("post")
        checker.addSleep(Sleep(3))
        checker.appendEncoding("utf-8")
        checker.appendPostData(lambda yzm, company_key: {
            "code": yzm,
            "textfield": company_key
        })
        checker.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=checkValidatecode,
                  redo_module="init_validate_code"))

        self.module_manager.appendSubModule(checker, True)
コード例 #12
0
ファイル: CrawlerQinghai.py プロジェクト: chybot/crawler
    def getCmpnySereachList(self):
        """
        Fetch the company list.

        The module (id ``get_search_list``) wraps ``self.visitSearchList``
        and POSTs the captcha and company keyword to ``searchList.jspx``.

        :output: url_list, name_list, search_list
        :return:
        """
        request_headers = {
            "Host": "218.95.241.36",
            "Proxy-Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Origin": "http://218.95.241.36",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "http://218.95.241.36/search.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }
        build_form = lambda yzm, company_key: {
            "checkNo": yzm,
            "entName": company_key
        }
        pair_up = lambda url_list, name_list: zip(url_list, name_list)
        is_company = lambda: not (
            self.report.access_type == SeedAccessType.NON_COMPANY)

        search = Module(self.visitSearchList, u"抓取公司列表")
        search.module_id = "get_search_list"
        search.appendUrl("http://218.95.241.36/searchList.jspx")
        search.appendHeaders(request_headers)
        search.appendWebMethod("post")
        search.appendEncoding("utf-8")
        search.appendPostData(build_form)
        search.addSleep(Sleep(3))

        search.appendOutput("url_list",
                            ".//*[@class='list']/ul/li/a/@href",
                            OutputType.LIST)
        search.appendOutput("name_list",
                            ".//*[@class='list']/ul/li/a/text()",
                            OutputType.LIST)
        search.appendOutput(name="search_list",
                            type=OutputType.FUNCTION,
                            function=pair_up)

        search.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=0,
                  assert_function=is_company))
        for event_type in (EventType.EXCEPTION_OCCURED,
                           EventType.OUTPUT_NOT_SATISFIED):
            search.addEvent(
                Event(event_type,
                      retry_times=100,
                      redo_module="init_validate_code"))

        self.module_manager.appendSubModule(search)
コード例 #13
0
ファイル: CrawlerJilin.py プロジェクト: chybot/crawler
 def initConfigSearchList(self):
     """Configure the search-list module on ``self.module_manager``.

     The module wraps ``self.visitSearchList`` and POSTs the keyword, CSRF
     token and MD5 of the captcha (via ``getMd5WithString``) to
     ``indsearch``.  ``url_list`` and ``name_list`` are extracted by XPath
     and zipped into ``search_list``.
     """
     request_headers = {
         'Connection': 'keep-alive',
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
         'Accept-Encoding': 'gzip, deflate',
         'Cache-Control': 'max-age=0',
         'Referer': 'http://211.141.74.198:8081/aiccips/',
         'Host': '211.141.74.198:8081',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0'
     }
     # csrf[-1] is used without a check: per the original author, the home
     # page output module guarantees it is non-empty and the parameter is
     # mandatory.
     build_form = lambda csrf, yzm, company_key: {
         'kw': company_key,
         '_csrf': csrf[-1],
         'secode': getMd5WithString(yzm)
     }
     pair_up = lambda url_list, name_list: zip(url_list, name_list)
     is_company = lambda: not (
         self.report.access_type == SeedAccessType.NON_COMPANY)

     search = Module(self.visitSearchList, u"搜索列表")
     search.appendUrl('http://211.141.74.198:8081/aiccips/pub/indsearch')
     search.appendHeaders(request_headers)
     search.appendWebMethod("post")
     search.appendPostData(build_form)
     search.appendCookie("cookie")
     search.appendOutput("url_list",
                         ".//*[@class='list']/ul/li/a/@href",
                         OutputType.LIST)
     search.appendOutput("name_list",
                         ".//*[@class='list']/ul/li/a/text()",
                         OutputType.LIST)
     search.appendOutput(name="search_list",
                         type=OutputType.FUNCTION,
                         function=pair_up)
     for event_type in (EventType.EXCEPTION_OCCURED,
                        EventType.OUTPUT_NOT_SATISFIED):
         search.addEvent(
             Event(event_type,
                   retry_times=100,
                   redo_module="module_cookie"))
     search.addEvent(
         Event(EventType.ASSERT_FAILED,
               retry_times=0,
               assert_function=is_company))
     search.addSleep(Sleep(3))
     self.module_manager.appendSubModule(search)
コード例 #14
0
ファイル: CrawlerZhejiang.py プロジェクト: chybot/crawler
    def initPostParam(self):
        """Configure the captcha-verification JSON module.

        The module wraps ``self.getJson`` and POSTs the company keyword and
        captcha to ``doValidatorVerifyCode.do``.  All three registered
        events name ``module_validate_code`` as their ``redo_module``; the
        assertion requires the response to carry ``flag == '1'``.
        """
        def postDataJsonAssert(json=None):
            # The response must be non-empty and contain flag == '1'.
            if not json:
                return False
            if 'flag' not in json or json['flag'] != '1':
                return False
            return True

        module = Module(self.getJson, "json中间结果")
        module.appendUrl(
            "http://gsxt.zjaic.gov.cn/search/doValidatorVerifyCode.do")
        module.appendHeaders({
            'Host': 'gsxt.zjaic.gov.cn',
            'User-Agent':
            "Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0",
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer':
            'http://gsxt.zjaic.gov.cn/search/doEnGeneralQueryPage.do',
            'Connection': 'keep-alive',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache'
        })
        module.appendWebMethod("post")
        module.appendPostData(lambda company_key, yzm: {
            "name": company_key,
            "verifyCode": yzm
        })

        module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED,
                  redo_module="module_validate_code"))
        module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  redo_module="module_validate_code"))
        module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  assert_function=postDataJsonAssert,
                  redo_module="module_validate_code"))

        self.module_manager.appendSubModule(module)
コード例 #15
0
 def initConfigSearchList(self):
     """Configure the search-list module on ``self.module_manager``.

     The module wraps ``self.visitSearchList`` and POSTs the value named
     ``post_data`` to ``showInfo.html``; ``search_list`` is taken from
     nodes whose class is ``list``.
     """
     request_headers = {
         'Host': 'gsxt.gdgs.gov.cn',
         'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
         'Accept-Encoding': 'gzip, deflate',
         'Referer': 'http://gsxt.gdgs.gov.cn/aiccips/',
         'Content-Type': 'application/x-www-form-urlencoded',
         'Connection': 'keep-alive',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0'
     }
     is_company = lambda: not (
         self.report.access_type == SeedAccessType.NON_COMPANY)

     search = Module(self.visitSearchList, "搜索列表")
     search.appendUrl(
         'http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/showInfo.html')
     search.appendHeaders(request_headers)
     search.appendWebMethod("post")
     search.appendPostData("post_data")
     search.appendOutput("search_list", ".//*[@class='list']",
                         OutputType.LIST)
     for event_type in (EventType.EXCEPTION_OCCURED,
                        EventType.OUTPUT_NOT_SATISFIED):
         search.addEvent(
             Event(event_type,
                   retry_times=100,
                   redo_module="module_validate_code"))
     search.addEvent(
         Event(EventType.ASSERT_FAILED,
               retry_times=0,
               assert_function=is_company))
     self.module_manager.appendSubModule(search)
コード例 #16
0
    def getCmpnySereachList(self):
        """Configure the company search-list module (id ``get_search_list``).

        The module wraps ``self.visitSearchList`` and POSTs the captcha and
        company keyword to ``searchList.jspx``.  ``url_list`` and
        ``name_list`` are extracted by XPath and zipped into ``search_list``.
        """
        request_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/plain, */*; q=0.01",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
        }
        build_form = lambda yzm, company_key: {
            "checkNo": yzm,
            "entName": company_key
        }
        pair_up = lambda url_list, name_list: zip(url_list, name_list)
        is_company = lambda: not (
            self.report.access_type == SeedAccessType.NON_COMPANY)

        search = Module(self.visitSearchList, u"抓取公司列表")
        search.module_id = "get_search_list"
        search.appendUrl("http://gsxt.hljaic.gov.cn/searchList.jspx")
        search.appendHeaders(request_headers)
        search.appendWebMethod("post")
        search.appendEncoding("utf-8")
        search.appendPostData(build_form)
        search.addSleep(Sleep(3))

        search.appendOutput("url_list",
                            ".//*[@class='list']/ul/li/a/@href",
                            OutputType.LIST)
        search.appendOutput("name_list",
                            ".//*[@class='list']/ul/li/a/text()",
                            OutputType.LIST)
        search.appendOutput(name="search_list",
                            type=OutputType.FUNCTION,
                            function=pair_up)

        search.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=0,
                  assert_function=is_company))
        for event_type in (EventType.EXCEPTION_OCCURED,
                           EventType.OUTPUT_NOT_SATISFIED):
            search.addEvent(
                Event(event_type,
                      retry_times=100,
                      redo_module="init_validate_code"))

        self.module_manager.appendSubModule(search)
コード例 #17
0
ファイル: CrawlerJilin.py プロジェクト: chybot/crawler
 def initPenaltyInfo(self, module_super):
     """Configure the administrative-penalty module and attach it to ``module_super``.

     The module wraps ``self.visitXzcf`` and POSTs ``encrpripid`` (taken
     from ``params[0]``) to ``gsxzcfxx``, sending ``csrf[-1]`` as the
     ``X-CSRF-TOKEN`` header.

     :param module_super: object whose ``appendSubModule`` receives the
         configured module.
     """
     # The URL callable ignores its ``params`` argument and always returns
     # the same address.
     build_url = lambda params: (
         'http://211.141.74.198:8081/aiccips/pub/gsxzcfxx')
     build_headers = lambda csrf: {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': '211.141.74.198:8081',
         'X-CSRF-TOKEN': csrf[-1]
     }
     build_form = lambda params: {'encrpripid': params[0]}

     penalty = Module(self.visitXzcf, u"获取行政处罚信息")
     penalty.appendUrl(build_url)
     penalty.appendHeaders(build_headers)
     penalty.appendWebMethod("post")
     penalty.appendPostData(build_form)
     module_super.appendSubModule(penalty)
コード例 #18
0
ファイル: CrawlerQinghai.py プロジェクト: chybot/crawler
    def checkValidateCode(self):
        """
        Verify the captcha produced by the previous module.

        The module (id ``check_validate_code``) wraps ``self.getJson`` and
        POSTs ``checkNo=<yzm>`` to ``checkCheckNo.jspx``; the assertion
        requires the response to contain ``{success:true}``.

        :return:
        """
        request_headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Proxy-Connection": "keep-alive",
            "Host": "218.95.241.36",
            "Origin": "http://218.95.241.36",
            "Referer": "http://218.95.241.36/search.jspx",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }

        def checkValidatecode(json=None):
            if json and "{success:true}" in json:
                return True
            # Empty response or missing success marker: log and fail.
            self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
            return False

        checker = Module(self.getJson, u"校验验证码")
        checker.module_id = "check_validate_code"
        checker.appendUrl('http://218.95.241.36/checkCheckNo.jspx')
        checker.appendHeaders(request_headers)
        checker.appendWebMethod("post")
        checker.appendEncoding("utf-8")
        checker.appendPostData(lambda yzm: {"checkNo": yzm})

        checker.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=checkValidatecode,
                  redo_module="init_validate_code"))
        checker.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="init_validate_code"))

        self.module_manager.appendSubModule(checker, True)
コード例 #19
0
ファイル: CrawlerShandong.py プロジェクト: chybot/crawler
 def initXzcfxxInfo(self, module_super):
     """Configure the administrative-penalty JSON module and attach it to ``module_super``.

     The module wraps ``self.visitXzcfJson`` and POSTs ``encrpripid``
     (``com[2]``) to ``gsxzcfxx``.  The headers use ``com[1]`` to build the
     ``Referer`` and ``csrf[0]`` as the ``X-CSRF-TOKEN``.

     :param module_super: object whose ``appendSubModule`` receives the
         configured module.
     """
     build_headers = lambda com, csrf, ua: {
         "Host": "218.57.139.24",
         "User-Agent": ua,
         "Accept": "application/json, text/javascript, */*; q=0.01",
         "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
         "Accept-Encoding": "gzip, deflate",
         "Connection": "keep-alive",
         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
         'Referer': 'http://218.57.139.24/pub/' + com[1],
         'X-CSRF-TOKEN': csrf[0],
         'X-Requested-With': 'XMLHttpRequest'
     }
     build_form = lambda com: {'encrpripid': com[2]}

     penalty = Module(self.visitXzcfJson, u"获取行政处罚信息")
     penalty.appendUrl("http://218.57.139.24/pub/gsxzcfxx")
     penalty.appendHeaders(build_headers)
     penalty.appendWebMethod("post")
     penalty.addSleep(Sleep(2))
     penalty.addEvent(
         Event(EventType.EXCEPTION_OCCURED,
               retry_times=5,
               redo_module='home_page'))
     penalty.appendPostData(build_form)
     module_super.appendSubModule(penalty, True)
コード例 #20
0
    def initQynbInfo(self, module_super):
        """Configure the annual-report modules and attach them to ``module_super``.

        Two modules are appended in order: a handler-less module whose
        function output stores the stripped ``qynb_year`` in
        ``self.value_dict['nb_name']``, and a module wrapping
        ``self.visitQynb`` that POSTs ``method=ndbgDetail`` to ``ztxy.do``.

        :param module_super: object whose ``appendSubModule`` receives both
            modules.
        """
        def saveNbyear(qynb_year):
            # Only a non-blank year is recorded.
            year = qynb_year.strip() if qynb_year else None
            if year:
                self.value_dict['nb_name'] = year

        year_module = Module(None, u"设置年份")
        year_module.appendOutput(type=OutputType.FUNCTION,
                                 function=saveNbyear)
        module_super.appendSubModule(year_module, True)

        build_headers = lambda ua: {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
            "Connection": "keep-alive"
        }
        build_form = lambda pripid, qynb_year: {
            'maent.nd': qynb_year.strip(),
            'maent.pripid': pripid,
            'method': 'ndbgDetail',
            'random': str(int(time.time() * 1000))
        }

        report_module = Module(self.visitQynb, u"获取企业年报")
        report_module.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
        report_module.appendHeaders(build_headers)
        report_module.appendWebMethod("post")
        report_module.appendPostData(build_form)
        # , show_up=OutputParameterShowUpType.OPTIONAL) was left disabled
        # on both outputs below.
        report_module.appendOutput(
            "nb_zch",
            '//div[@id="qufenkuang"]/table[1]/tr[3]/td[1]/text()',
            OutputType.LIST)
        report_module.appendOutput(
            "nb_qym",
            '//div[@id="qufenkuang"]/table[1]/tr[3]/td[2]/text()',
            OutputType.LIST)
        # Rough check that the annual report carries a registration number
        # and a company name (they may be missing when requests are sent
        # too quickly and no data comes back).
        report_module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5))
        report_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=5))
        report_module.addSleep(Sleep(2))
        module_super.appendSubModule(report_module, True)
コード例 #21
0
    def getCompanyRecordInfo(self, module_super):
        """Append the filing-record ("get_record_info") module to ``module_super``.

        The module is driven by ``self.visitBaxx`` and POSTs ``entNo``,
        ``entType`` and ``regOrg`` taken from positions 1-3 of
        ``params_list`` to the ``entCheckInfo`` service.
        """
        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': "http://www.nmgs.gov.cn:7001/aiccips/GSpublicity/GSpublicityList.html?service=entInfo",
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        }

        def buildPostData(params_list):
            # NOTE(review): the argument name is presumably matched by name by
            # the Module framework -- keep it unchanged.
            ent_no, ent_type, reg_org = (params_list[1], params_list[2],
                                         params_list[3])
            return {
                "entNo": str(ent_no),
                "entType": str(ent_type),
                "regOrg": str(reg_org),
            }

        record_module = Module(self.visitBaxx, u"抓取备案信息")
        record_module.module_id = "get_record_info"
        record_module.appendUrl(
            "http://www.nmgs.gov.cn:7001/aiccips/GSpublicity/GSpublicityList.html?service=entCheckInfo"
        )
        record_module.appendHeaders(request_headers)
        record_module.appendEncoding("utf-8")
        record_module.addSleep(Sleep(3))
        record_module.appendWebMethod("post")
        record_module.appendPostData(buildPostData)
        record_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=100))
        module_super.appendSubModule(record_module, True)
コード例 #22
0
ファイル: CrawlerJiangxi.py プロジェクト: chybot/crawler
    def assertValidateCode(self):
        """Append the captcha-check module ("checkValidateCode") to the manager.

        The module POSTs the captcha answer (``yzm``) together with the
        search keyword and asserts that the JSON reply has ``msg == 'true'``.
        A failed assertion asks for ``module_validate_code`` to be redone.
        """
        def buildHeaders(ua):
            return {
                'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'User-Agent': ua,
                'Host': 'gsxt.jxaic.gov.cn',
                'Referer': 'http://gsxt.jxaic.gov.cn/ECPS/',
            }

        def buildPostData(yzm, company_key):
            return {"search": company_key, "yzm": yzm}

        def assertVaildCode(json=None):
            # A NON_COMPANY access type is reset to ERROR before the reply is
            # inspected.
            if self.report.access_type == SeedAccessType.NON_COMPANY:
                self.report.access_type = SeedAccessType.ERROR
            # Expected reply looks like {u'msg': u'true', u'success': True}.
            return bool(json and json.get("msg") == 'true')

        check_module = Module(self.getJson, u"验证码")
        check_module.module_id = "checkValidateCode"
        check_module.appendUrl(
            'http://gsxt.jxaic.gov.cn/ECPS/home/home_homeSearchYzm.pt')
        check_module.appendHeaders(buildHeaders)
        check_module.appendWebMethod("post")
        check_module.appendPostData(buildPostData)
        check_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=assertVaildCode,
                  redo_module="module_validate_code"))
        check_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=100))
        check_module.addSleep(Sleep(3))
        self.module_manager.appendSubModule(check_module, True)
コード例 #23
0
    def initAnnualReportPre(self, module_super):
        """Append the module that lists available annual-report years.

        The module ("fetch_qynb_list") POSTs the ``qygsInfo`` request and
        exposes, as the optional output ``qinb_param_list``, the distinct
        four-digit years found in ``doNdbg('YYYY')`` onclick handlers.
        """
        def buildHeaders(ua):
            return {
                "User-Agent": ua,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Host": "gsxt.scaic.gov.cn",
                "Origin": "http://gsxt.scaic.gov.cn",
                "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
                "Connection": "keep-alive",
            }

        def buildPostData(pripid):
            return {
                'czmk': 'czmk8',
                'maent.pripid': pripid,
                'method': 'qygsInfo',
                # Millisecond timestamp sent as the "random" field.
                'random': str(int(time.time() * 1000)),
            }

        def getQynbParamList(html):
            if not html:
                return []
            years = re.findall(r'\s+onclick="doNdbg\(\'(\d{4})\'\);"', html,
                               re.S)
            # De-duplicate; the resulting order is whatever set() yields.
            return list(set(years))

        list_module = Module(self.getWebHtml, u"获取年报年份列表")
        list_module.module_id = "fetch_qynb_list"
        list_module.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
        list_module.appendHeaders(buildHeaders)
        list_module.appendWebMethod("post")
        list_module.appendPostData(buildPostData)
        list_module.appendOutput(name="qinb_param_list",
                                 type=OutputType.FUNCTION,
                                 function=getQynbParamList,
                                 show_up=OutputParameterShowUpType.OPTIONAL)
        list_module.addSleep(Sleep(2))
        module_super.appendSubModule(list_module, True)
コード例 #24
0
 def initConfigSearchList(self):
     """Append the search-list module to ``self.module_manager``.

     The module POSTs the captcha answer and keyword to ``searchList.jspx``
     and collects the result links as ``search_list``. Exceptions and
     unsatisfied outputs ask for ``module_home_page`` to be redone.
     """
     def buildHeaders(ua):
         return {
             "Host": "xyjg.egs.gov.cn",
             "User-Agent": ua,
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Content-Type": "application/x-www-form-urlencoded",
             "Referer": "http://xyjg.egs.gov.cn/ECPS_HB/search.jspx",
         }

     def buildPostData(company_key, yzm):
         return {'checkNo': yzm, 'entName': company_key}

     def companyFound():
         # Fails the assertion once the report is flagged NON_COMPANY.
         return not (self.report.access_type == SeedAccessType.NON_COMPANY)

     search_module = Module(self.visitSearchList, u"搜索列表")
     search_module.appendUrl(
         'http://xyjg.egs.gov.cn/ECPS_HB/searchList.jspx')
     search_module.appendHeaders(buildHeaders)
     search_module.appendWebMethod("post")
     search_module.appendPostData(buildPostData)
     search_module.appendOutput("search_list",
                                "//*[@class='list']/ul/li/a/@href",
                                OutputType.LIST)
     search_module.addEvent(
         Event(EventType.EXCEPTION_OCCURED,
               retry_times=100,
               redo_module="module_home_page"))
     search_module.addEvent(
         Event(EventType.OUTPUT_NOT_SATISFIED,
               retry_times=100,
               redo_module="module_home_page"))
     search_module.addEvent(
         Event(EventType.ASSERT_FAILED,
               retry_times=0,
               assert_function=companyFound))
     search_module.appendMiddleValueMonitor("search_list")
     self.module_manager.appendSubModule(search_module)
コード例 #25
0
    def checkValidateCode(self):
        """Append the captcha-check module ("check_validate_code") to the manager.

        The module POSTs the captcha answer as ``checkNo`` and asserts that
        the reply contains ``{success:true}``. Both a failed assertion and an
        exception ask for ``init_validate_code`` to be redone.
        """
        request_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/search.jspx",
        }

        def buildPostData(yzm):
            return {"checkNo": yzm}

        def checkValidatecode(json=None):
            if json and "{success:true}" in json:
                return True
            # Empty reply or no success marker: the captcha must be refetched.
            self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
            return False

        check_module = Module(self.getJson, u"校验验证码")
        check_module.module_id = "check_validate_code"
        check_module.appendUrl('http://gsxt.hljaic.gov.cn/checkCheckNo.jspx')
        check_module.appendHeaders(request_headers)
        check_module.appendWebMethod("post")
        check_module.appendEncoding("utf-8")
        check_module.appendPostData(buildPostData)
        check_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=checkValidatecode,
                  redo_module="init_validate_code"))
        check_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="init_validate_code"))
        self.module_manager.appendSubModule(check_module, True)
コード例 #26
0
# -*- coding: utf-8 -*-
コード例 #27
0
ファイル: CrawlerZhejiang.py プロジェクト: chybot/crawler
    def initConfigSearchList(self):
        """Append the search-list module to ``self.module_manager``.

        The module POSTs the captcha answer (``yzm``) and the search keyword
        to ``home_homeSearch.pt``, extracts result URLs and names, zips them
        into ``search_list``, and registers assertions that flag the report
        as NO_VALID_COMPANY or NON_COMPANY.
        """
        module = Module(self.visitSearchList, "搜索列表")
        module.appendUrl(
            'http://gsxt.jxaic.gov.cn/ECPS/home/home_homeSearch.pt')
        # Static headers; the User-Agent is hard-coded here rather than injected.
        module.appendHeaders({
            'Host':
            'gsxt.jxaic.gov.cn',
            'Referer':
            'http://gsxt.jxaic.gov.cn/qyxxgsAction_queryXyxx.action',
            'Accept-Encoding':
            'gzip, deflate',
            'Cache-Control':
            'max-age=0',
            'Accept-Language':
            'en-US,en;q=0.8',
            'Content-Type':
            'application/x-www-form-urlencoded',
            'Connection':
            'keep-alive',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
        })
        module.appendWebMethod("post")
        module.appendPostData(lambda yzm, company_key: {
            "search": company_key,
            "yzm": yzm,
        })
        # Result links and their display names, extracted as parallel lists.
        module.appendOutput("url_list", './/*[@class="list"]/div/a/@href',
                            OutputType.LIST)
        module.appendOutput("name_list",
                            './/*[@class="list"]/div/a/font/text()',
                            OutputType.LIST)
        # search_list pairs each URL with the name at the same position.
        module.appendOutput(
            name="search_list",
            type=OutputType.FUNCTION,
            function=lambda url_list, name_list: zip(url_list, name_list))
        # Optional outputs read from the 'div0' element; assert_func below
        # treats a non-empty name_invalid_list with no URLs as "no valid
        # company".
        module.appendOutput("name_invalid_list",
                            ".//*[@id='div0']/div[1]/text()",
                            OutputType.LIST,
                            show_up=OutputParameterShowUpType.OPTIONAL)
        module.appendOutput("status_invalid_list",
                            ".//*[@id='div0']/div[2]/span[2]/text()",
                            OutputType.LIST,
                            show_up=OutputParameterShowUpType.OPTIONAL)
        module.appendOutput("page_nos",
                            ".//*[@id='form1']//div//td[@align]/text()",
                            show_up=OutputParameterShowUpType.OPTIONAL)

        # TODO: paginate the search-list page.
        # NOTE(review): page_range is not referenced anywhere in this method.
        # The "+ 3" offsets presumably skip one 3-byte UTF-8 character in a
        # Python 2 byte string -- confirm before wiring it in.
        def page_range(page_nos):
            if not page_nos:
                return None
            page_str = page_nos.strip()
            page_str = page_str[3:]
            page_str = page_str[page_str.find('共') + 3:page_str.find('页')]
            return range(2, int(page_str) + 1)

        # NOTE(review): EXCEPTION_OCCURED is registered twice in this method
        # (retry_times=2 here, retry_times=100 further down) with the same
        # redo_module; which one takes effect depends on the Event framework
        # -- confirm and drop the redundant registration.
        module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=2,
                  redo_module="module_validate_code"))

        def assert_func(url_list, name_invalid_list, html):
            # No result links but an "invalid" entry present: the keyword
            # matched only companies that are not valid.
            if not url_list and name_invalid_list:
                self.report.access_type = SeedAccessType.NO_VALID_COMPANY
                self.holder.logging.info("无有效公司列表!")
                return False
            # The page carries the "no data" marker: no such company.
            if '无数据' in html:
                self.report.access_type = SeedAccessType.NON_COMPANY
                self.holder.logging.info("无此公司!")
                return False
            return True

        # retry_times=0 on this assertion event.
        module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=0,
                  assert_function=assert_func))
        module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="module_validate_code"))
        module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED,
                  retry_times=100,
                  redo_module="module_validate_code"))
        self.module_manager.appendSubModule(module)
コード例 #28
0
# -*- coding: utf-8 -*-
コード例 #29
0
ファイル: CrawlerShandong.py プロジェクト: chybot/crawler
    def get_search_list(self):
        """Append the search-list module to ``self.module_manager``.

        The module POSTs the keyword, the CSRF token and the MD5 hex digest of
        the captcha answer to ``/pub/indsearch``, then turns every matching
        anchor into a ``(name, href, encrpripid, com_num)`` tuple exposed as
        the optional output ``search_list``.
        """
        def buildHeaders(ua):
            return {
                "Host": "218.57.139.24",
                "User-Agent": ua,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                'Referer': 'http://218.57.139.24/',
            }

        def assertCsrf(csrf):
            # A usable token list is a non-empty list.
            return isinstance(csrf, list) and len(csrf) > 0

        def hexDigest(data):
            return hashlib.md5(data).hexdigest()

        def buildPostData(company_key, yzm, csrf):
            return {
                'kw': company_key,
                '_csrf': csrf[0],
                'secode': hexDigest(str(yzm)),
            }

        def assertVaildCode(yzm, html=None):
            if not html:
                return False
            # Either marker in the page means the captcha was rejected.
            for marker in (u'计算错误', u'验证码超时,请重新计算'):
                if marker in html:
                    return False
            return True

        def assertProxyStatus(html):
            if u"每天最多可搜索" not in html:
                return True
            # The page reports a per-day search limit: switch proxy and fail
            # the assertion.
            DownLoader('shandong').changeProxy()
            return False

        def companyFound():
            return not (self.report.access_type == SeedAccessType.NON_COMPANY)

        def getSearchList(search_list_xpath=None):
            if not search_list_xpath:
                return []
            found = []
            for anchor in search_list_xpath:
                names = anchor.xpath('./text()')
                hrefs = anchor.xpath('./@href')
                if not (names and hrefs):
                    continue
                segments = hrefs[0].split('/')
                # Only hrefs with exactly three '/'-separated parts are
                # accepted (original note: could the number of URL parts
                # change?).
                if len(segments) != 3:
                    continue
                com_num = segments[1].strip()
                encrpripid = segments[2].strip()
                if com_num and encrpripid:
                    found.append((names[0].strip(), hrefs[0].strip(),
                                  encrpripid, com_num))
            return found

        search_module = Module(self.visitSearchList, u"搜索列表")
        search_module.appendUrl('http://218.57.139.24/pub/indsearch')
        search_module.appendHeaders(buildHeaders)
        search_module.appendWebMethod("post")
        search_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=assertCsrf,
                  redo_module="home_page"))
        search_module.appendPostData(buildPostData)
        search_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=assertVaildCode,
                  redo_module="home_page"))
        search_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=20,
                  assert_function=assertProxyStatus,
                  redo_module="home_page"))
        search_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=5,
                  assert_function=companyFound))
        search_module.appendOutput("search_list_xpath",
                                   '//ul/li[@class="font16"]/a',
                                   OutputType.LIST)
        search_module.appendOutput(
            name="search_list",
            type=OutputType.FUNCTION,
            function=getSearchList,
            show_up=OutputParameterShowUpType.OPTIONAL)
        search_module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
        search_module.addSleep(Sleep(2))
        search_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module='home_page'))
        self.module_manager.appendSubModule(search_module, True)
コード例 #30
0
    def initConfigSearchList(self):
        """Append the search-list module ("get_search_list") to the manager.

        The module POSTs the captcha answer and the GB2312-encoded keyword to
        the ``method=list`` endpoint, derives ``yzm_flag`` through
        ``self.getYzmFlag`` and the result anchors through
        ``self.getSearchList``. Failed assertions, unsatisfied outputs and
        exceptions all ask for ``check_validatecode`` to be redone.
        """
        module = Module(self.visitSearchList, u"搜索列表")
        module.module_id = "get_search_list"
        # Millisecond timestamp used as the "random" query value. Scale to
        # milliseconds *before* truncating so sub-second precision is kept
        # (the previous int(time.time()) * 1000 always ended in 000).
        # Note it is computed once, when this module is configured.
        module.appendUrl(
            "http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=" +
            str(int(time.time() * 1000)))
        module.appendWebMethod("post")
        module.appendHeaders(
            lambda ua: {
                "User-Agent": ua,
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Host": "gsxt.scaic.gov.cn",
                "Origin": "http://gsxt.scaic.gov.cn",
                "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
                "Connection": "keep-alive"
            })

        def getPostData(yzm, company_key):
            """Build the form body with the keyword re-encoded as GB2312."""
            if isinstance(company_key, unicode):
                company_key = company_key.encode('gb2312')
            else:
                # Byte string of unknown encoding: detect it, then transcode.
                # When detection yields nothing the bytes are sent unchanged.
                charcode = chardet.detect(company_key).get('encoding')
                if charcode:
                    company_key = company_key.decode(charcode).encode('gb2312')
            rs_dict = {
                'currentPageNo': '1',
                'yzm': yzm,
                'maent.entname': company_key,
                "pName": u'请输入营业执照注册号或统一社会信用代码'.encode('gb2312')
            }
            return rs_dict

        module.appendPostData(getPostData)
        module.appendOutput(name="yzm_flag",
                            type=OutputType.FUNCTION,
                            function=self.getYzmFlag,
                            show_up=OutputParameterShowUpType.OPTIONAL)

        def assertYzmFlag(yzm_flag=None):
            # Passes only when yzm_flag equals 'yes'.
            return True if yzm_flag == 'yes' else False

        module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=assertYzmFlag,
                  redo_module='check_validatecode'))

        def assertNoCompany(yzm_flag=None):
            # Fails only when yzm_flag is 'yes' and the report is flagged
            # NON_COMPANY.
            no_company = (yzm_flag == 'yes' and self.report.access_type ==
                          SeedAccessType.NON_COMPANY)
            return False if no_company else True

        module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=6,
                  assert_function=assertNoCompany,
                  redo_module="check_validatecode"))
        # Fetch the company search list (company name and onclick event
        # arguments).
        module.appendOutput("search_list_xpath",
                            './/ul/li[@class="font16"]/a', OutputType.LIST)
        module.appendOutput(name="tag_alist",
                            type=OutputType.FUNCTION,
                            function=self.getSearchList,
                            show_up=OutputParameterShowUpType.OPTIONAL)
        module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED,
                  retry_times=100,
                  redo_module="check_validatecode"))
        module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module='check_validatecode'))
        module.addSleep(Sleep(3))
        self.module_manager.appendSubModule(module, True)