def initConfigBaseInfo(self, module_super):
    """Configure the basic-information module and attach it to module_super.

    The module wraps ``self.visitJbxx`` and is configured with web method
    "post" against ``http://gsxt.scaic.gov.cn/ztxy.do`` (``method=qyInfo``).
    It registers three outputs: ``company_zch_list`` (XPath list), a
    function output that stores the first list entry in
    ``self.value_dict['company_zch']``, and the optional ``xh_pripid``
    regex matches taken from the page HTML.

    :param module_super: parent module that receives the new sub-module.
    """

    def build_post_data(pripid, entbigtype):
        # 'random' carries the current time in milliseconds.
        return {
            'djjg': '',
            'maent.entbigtype': entbigtype,
            'maent.pripid': pripid,
            'method': 'qyInfo',
            'random': str(int(time.time() * 1000))
        }

    def build_headers(ua):
        return {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
            "Connection": "keep-alive"
        }

    def setCompanyZch(company_zch_list=None):
        # Keep only the first registration number, or None when none matched.
        if company_zch_list:
            self.value_dict['company_zch'] = company_zch_list[0]
        else:
            self.value_dict['company_zch'] = None

    def assertNameZch(company_name=None, company_zch=None):
        # Assert on the company name and registration number: both must be
        # present and shorter than 100 characters.
        if not company_name or not company_zch:
            return False
        name_ok = 0 < len(company_name) < 100
        zch_ok = 0 < len(company_zch) < 100
        return name_ok and zch_ok

    def getXhPripid(html):
        # Extract the showRyxx(...) argument pairs from the basic-information
        # page; they become the POST parameters for shareholder details.
        pattern = r'\s+onclick="showRyxx\(\'(.+?)\'\,\'(.+?)\'\)"'
        return re.findall(pattern, html, re.S)

    module = Module(self.visitJbxx, u"基本信息")
    module.appendPostData(build_post_data)
    module.appendWebMethod("post")
    module.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
    module.appendHeaders(build_headers)
    module.appendOutput("company_zch_list",
                        '//table[1]/tr[2]/td[1]/text()',
                        OutputType.LIST)
    module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5))
    module.appendOutput(type=OutputType.FUNCTION, function=setCompanyZch)
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=5,
              assert_function=assertNameZch))
    module.appendOutput(name='xh_pripid',
                        type=OutputType.FUNCTION,
                        function=getXhPripid,
                        show_up=OutputParameterShowUpType.OPTIONAL)
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    module.addSleep(Sleep(2))
    module_super.appendSubModule(module, True)
def checkCompanyName(self):
    """Configure the company-name verification module.

    The module wraps ``self.getWebHtml`` and is configured with web method
    'post', sending ``qymc=<company_key>`` to the keyword-filter URL.  The
    ``random`` query value is the current time in seconds, computed once
    when this method runs.  The registered assertion passes only when the
    stripped response equals ``'1'``.
    """

    def build_headers(ua):
        return {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
            "Connection": "keep-alive"
        }

    def build_post_data(company_key):
        return {'qymc': company_key}

    def assertRecode(html):
        # A NON_COMPANY access type is rewritten to ERROR before the check.
        if self.report.access_type == SeedAccessType.NON_COMPANY:
            self.report.access_type = SeedAccessType.ERROR
        return bool(html and html.strip() == '1')

    module = Module(self.getWebHtml, u'验证公司名称')
    timestamp = str(int(time.time()))
    module.appendUrl(
        "http://gsxt.scaic.gov.cn/keyword.do?method=keywordFilter&random=" +
        timestamp)
    module.appendHeaders(build_headers)
    module.appendPostData(build_post_data)
    module.appendWebMethod('post')
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=assertRecode,
              redo_module="check_validatecode"))
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module.addSleep(Sleep(3))
    self.module_manager.appendSubModule(module, True)
def initSubmitYzm(self):
    """Configure step two: submit the captcha for verification.

    The module wraps ``self.getJson`` and is configured with web method
    "post", sending ``checkNo=<yzm>`` to ``checkCheckNo.jspx`` on
    aic.hainan.gov.cn.  Both registered events name ``"hn_yzm_pic"`` as
    their redo module.
    """
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "aic.hainan.gov.cn:1888",
        "Origin": "http://aic.hainan.gov.cn:1888",
        "Proxy-Connection": "keep-alive",
        "Referer": "http://aic.hainan.gov.cn:1888/search.jspx",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    def build_post_data(yzm):
        return {"checkNo": yzm}

    def submitYzmResult(json=None):
        # Captcha check: the response must contain '{success:true}'.
        return bool(json) and '{success:true}' in json

    module = Module(self.getJson, u"第二步_提交验证码验证")
    module.appendUrl("http://aic.hainan.gov.cn:1888/checkCheckNo.jspx")
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module.addSleep(Sleep(3))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=0,
              redo_module="hn_yzm_pic"))
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              assert_function=submitYzmResult,
              retry_times=0,
              redo_module="hn_yzm_pic"))
    self.module_manager.appendSubModule(module)
def initChangeInfoPage(self, module_super):
    """Configure the paged change-information modules.

    An iterator module built from ``Iterator("bgxx_pages", "page_no")`` is
    appended to ``module_super``.  Its sub-module wraps ``self.visitBgxx``
    and is configured with web method "post", sending the page number
    (five rows per page) to the ``gsgs_viewDjxxBgxx.pt`` URL for ``qyid``.

    :param module_super: parent module that receives the iterator module.
    """
    headers = {
        'Host': 'gsxt.jxaic.gov.cn',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }

    def build_url(qyid):
        return "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s" % qyid

    def build_post_data(page_no):
        return {'page': page_no, 'limit': 5, 'mark': 0}

    pager = Iterator("bgxx_pages", "page_no")
    module = Module(None, "进入变更信息翻页", pager)
    module_super.appendSubModule(module)

    sub_module = Module(self.visitBgxx, "获取变更翻页信息")
    sub_module.appendUrl(build_url)
    sub_module.appendWebMethod("post")
    sub_module.appendPostData(build_post_data)
    sub_module.appendHeaders(headers)
    module.appendSubModule(sub_module)
def initPenaltyInfo(self, module_super):
    """Configure the administrative-penalty module and attach it.

    The module wraps ``self.visitXzcf`` and is configured with web method
    "post" against ``http://gsxt.scaic.gov.cn/ztxy.do`` with
    ``method=cfInfo`` and ``czmk=czmk3``.

    :param module_super: parent module that receives the new sub-module.
    """

    def build_headers(ua):
        return {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
            "Connection": "keep-alive"
        }

    def build_post_data(pripid, entbigtype):
        # 'random' carries the current time in milliseconds.
        return {
            'czmk': 'czmk3',
            'maent.entbigtype': entbigtype,
            'maent.pripid': pripid,
            'method': 'cfInfo',
            'random': str(int(time.time() * 1000))
        }

    module = Module(self.visitXzcf, u"获取行政处罚信息")
    module.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
    module.appendHeaders(build_headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    module.addSleep(Sleep(2))
    module_super.appendSubModule(module, True)
def initSearchPage(self, module_super):
    """Configure the paged search-list module and attach it.

    The module wraps ``self.visitSearchList`` and is configured with web
    method "post", sending the session token, keyword and page number to
    ``ent_info_list`` on www.sgs.gov.cn.  The ``search_list`` output
    collects every ``div`` with class ``list-item``.

    :param module_super: parent module that receives the new sub-module.
    """
    headers = {
        'Host': 'www.sgs.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'https://www.sgs.gov.cn/notice/home',
        'Content-Type': 'application/x-www-form-urlencoded'
    }

    def build_post_data(token, company_key, page_no):
        return {
            'searchType': '1',
            'captcha': 0,
            'session.token': token,
            'condition.keyword': company_key,
            'condition.pageNo': page_no
        }

    module = Module(self.visitSearchList, u"搜索列表-翻页")
    module.appendUrl('https://www.sgs.gov.cn/notice/search/ent_info_list')
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module.appendOutput("search_list",
                        './/div[@class="list-item"]',
                        OutputType.LIST)
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=2))
    module_super.appendSubModule(module)
def initSearchList(self):
    """Configure step three: search for the company list.

    The module wraps ``self.visitSearchList`` and is configured with web
    method "post", sending the captcha and company keyword to
    ``searchList.jspx`` on aic.hainan.gov.cn.  ``search_list`` is produced
    by zipping the ``url_list`` and ``name_list`` outputs.  The assertion
    fails when ``self.report.access_type`` is
    ``SeedAccessType.NON_COMPANY``.
    """
    headers = {
        "Host": "aic.hainan.gov.cn:1888",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Origin": "http://aic.hainan.gov.cn:1888",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://aic.hainan.gov.cn:1888/search.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }

    def build_post_data(yzm, company_key):
        return {"checkNo": yzm, "entName": company_key}

    def pair_urls_with_names(url_list, name_list):
        return zip(url_list, name_list)

    def company_found():
        return not (self.report.access_type == SeedAccessType.NON_COMPANY)

    module = Module(self.visitSearchList, u"第三步_开始搜索公司列表")
    module.appendUrl("http://aic.hainan.gov.cn:1888/searchList.jspx")
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module.appendOutput("url_list",
                        ".//div[@class='list']//a/@href",
                        OutputType.LIST)
    module.appendOutput("name_list",
                        ".//div[@class='list']//a/text()",
                        OutputType.LIST)
    module.appendOutput(name="search_list",
                        type=OutputType.FUNCTION,
                        function=pair_urls_with_names)
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=20,
              redo_module="hn_yzm_pic"))
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=20,
              redo_module="hn_yzm_pic"))
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=company_found))
    module.appendMiddleValueMonitor("search_list")
    module.addSleep(Sleep(1))
    self.module_manager.appendSubModule(module)
def initChangeInfoPage(self, module_super):
    """Configure the paged change-information modules.

    An iterator module built from ``Iterator("bgxx_pages", "page_no")`` is
    appended to ``module_super``.  Its sub-module wraps ``self.visitBgxx``
    and is configured with web method "post", sending the page number
    (five rows per page) to the ``gsgs_viewDjxxBgxx.pt`` URL for ``qyid``;
    the User-Agent header is taken from the ``ua`` parameter.

    :param module_super: parent module that receives the iterator module.
    """

    def build_url(qyid):
        return "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s" % qyid

    def build_post_data(page_no):
        return {'page': page_no, 'limit': 5, 'mark': 0}

    def build_headers(ua):
        return {
            'Host': 'gsxt.jxaic.gov.cn',
            'Connection': 'keep-alive',
            'User-Agent': ua,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

    pager = Iterator("bgxx_pages", "page_no")
    module = Module(None, u"进入变更信息翻页", pager)
    module_super.appendSubModule(module)

    sub_module = Module(self.visitBgxx, u"获取变更翻页信息")
    sub_module.appendUrl(build_url)
    sub_module.appendWebMethod("post")
    sub_module.appendPostData(build_post_data)
    sub_module.appendHeaders(build_headers)
    module.appendSubModule(sub_module, True)
def initShareHolderDetail(self, module_super):
    """Configure the shareholder-detail modules and attach them.

    An iterator module (id ``"fetch_gdxq_info"``) built from
    ``Iterator("xh_pripid", "xh_prid")`` is appended to ``module_super``.
    Its sub-module wraps ``self.visitGdxq`` and is configured with web
    method "post" against ``http://gsxt.scaic.gov.cn/ztxy.do`` with
    ``method=tzrCzxxDetial``.

    :param module_super: parent module that receives the iterator module.
    """

    def build_headers(ua):
        return {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
            "Connection": "keep-alive"
        }

    def build_post_data(xh_prid):
        # The original literal listed 'random' twice; a dict keeps only the
        # last value for a repeated key, so one entry is equivalent.
        return {
            'maent.pripid': xh_prid[1],
            'maent.entbigtype': xh_prid[0],
            'random': str(int(time.time() * 1000)),
            'method': 'tzrCzxxDetial'
        }

    iterator = Iterator("xh_pripid", "xh_prid")
    module = Module(None, "进入股东详情", iterator)
    module.module_id = "fetch_gdxq_info"
    module_super.appendSubModule(module, True)

    sub_module = Module(self.visitGdxq, u"获取股东翻页信息")
    sub_module.appendUrl('http://gsxt.scaic.gov.cn/ztxy.do')
    sub_module.appendHeaders(build_headers)
    sub_module.appendWebMethod("post")
    sub_module.appendPostData(build_post_data)
    # NOTE(review): the event and sleep are registered on the iterator
    # module, not on sub_module -- confirm this is intended.
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    module.addSleep(Sleep(2))
    module.appendSubModule(sub_module, True)
def getCmpnySereachList(self):
    """Configure the company-list module (id ``"get_search_list"``).

    The module wraps ``self.visitSearchList`` and is configured with web
    method "post", sending the captcha and ``textfield`` value to
    ``showInfo.html`` on www.nmgs.gov.cn.  ``search_list`` is produced by
    zipping the ``url_list`` and ``name_list`` outputs.  The assertion
    fails when ``self.report.access_type`` is
    ``SeedAccessType.NON_COMPANY``.
    """
    headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }

    def build_post_data(yzm, textfield):
        # The raw-string pattern removes the two-character sequence
        # backslash + 'n', not a newline character.
        cleaned = textfield.replace(r"\n", "")
        return {"code": yzm, "textfield": cleaned}

    def pair_urls_with_names(url_list, name_list):
        return zip(url_list, name_list)

    def company_found():
        return not (self.report.access_type == SeedAccessType.NON_COMPANY)

    module = Module(self.visitSearchList, u"抓取公司列表")
    module.module_id = "get_search_list"
    module.appendUrl(
        "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/showInfo.html")
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendEncoding("utf-8")
    module.appendPostData(build_post_data)
    module.addSleep(Sleep(3))
    module.appendOutput("url_list",
                        ".//*[@class='list']/ul/li/a/@href",
                        OutputType.LIST)
    module.appendOutput("name_list",
                        ".//*[@class='list']/ul/li/a/text()",
                        OutputType.LIST)
    module.appendOutput(name="search_list",
                        type=OutputType.FUNCTION,
                        function=pair_urls_with_names)
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=company_found))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="init_validate_code"))
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=100,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(module)
def checkValidateCode(self):
    """Configure the captcha-check module (id ``"check_validate_code"``).

    The module wraps ``self.getJson`` and is configured with web method
    "post", sending the captcha and company keyword to ``checkCode.html``
    on www.nmgs.gov.cn.  The assertion extracts the double-quoted tokens
    from the response body; on success it stores the fourth token, decoded
    with ``raw_unicode_escape``, in ``self.value_dict["textfield"]``.
    """
    headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }

    def build_post_data(yzm, company_key):
        return {"code": yzm, "textfield": company_key}

    def checkValidatecode(web=None):
        if not web:
            return False
        flags = re.compile(r'\"([\s\S]*?)\"').findall(str(web.body))
        # Exactly four quoted tokens are expected, the third being
        # 'textfield'; a 'flag' token must be followed by '1'.
        malformed = len(flags) != 4 or flags[2] != 'textfield'
        if malformed or (flags[0] == 'flag' and flags[1] != str(1)):
            self.holder.logging.warning(u"验证码校验失败!")
            return False
        self.value_dict["textfield"] = flags[3].decode('raw_unicode_escape')
        return True

    module = Module(self.getJson, u"检验验证码")
    module.module_id = "check_validate_code"
    module.appendUrl(
        "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/checkCode.html")
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.addSleep(Sleep(3))
    module.appendEncoding("utf-8")
    module.appendPostData(build_post_data)
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=checkValidatecode,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(module, True)
def getCmpnySereachList(self):
    """Fetch the company list.

    Configures the module with id ``"get_search_list"``: it wraps
    ``self.visitSearchList`` and is configured with web method "post",
    sending the captcha and company keyword to ``searchList.jspx`` on
    218.95.241.36.

    :output: url_list, name_list, search_list
    :return: None
    """
    headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Origin": "http://218.95.241.36",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://218.95.241.36/search.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }

    def build_post_data(yzm, company_key):
        return {"checkNo": yzm, "entName": company_key}

    def pair_urls_with_names(url_list, name_list):
        return zip(url_list, name_list)

    def company_found():
        return not (self.report.access_type == SeedAccessType.NON_COMPANY)

    module = Module(self.visitSearchList, u"抓取公司列表")
    module.module_id = "get_search_list"
    module.appendUrl("http://218.95.241.36/searchList.jspx")
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendEncoding("utf-8")
    module.appendPostData(build_post_data)
    module.addSleep(Sleep(3))
    module.appendOutput("url_list",
                        ".//*[@class='list']/ul/li/a/@href",
                        OutputType.LIST)
    module.appendOutput("name_list",
                        ".//*[@class='list']/ul/li/a/text()",
                        OutputType.LIST)
    module.appendOutput(name="search_list",
                        type=OutputType.FUNCTION,
                        function=pair_urls_with_names)
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=company_found))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="init_validate_code"))
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=100,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(module)
def initConfigSearchList(self):
    """Configure the search-list module for 211.141.74.198:8081.

    The module wraps ``self.visitSearchList`` and is configured with web
    method "post", sending the keyword, the last ``csrf`` entry and the
    result of ``getMd5WithString(yzm)`` to ``indsearch``.
    ``search_list`` is produced by zipping the ``url_list`` and
    ``name_list`` outputs.
    """
    headers = {
        'Connection': 'keep-alive',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Accept-Encoding': 'gzip, deflate',
        'Cache-Control': 'max-age=0',
        'Referer': 'http://211.141.74.198:8081/aiccips/',
        'Host': '211.141.74.198:8081',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0'
    }

    def build_post_data(csrf, yzm, company_key):
        # The home-page output module guarantees csrf is non-empty and the
        # parameter is mandatory, so it is used without a check.
        return {
            'kw': company_key,
            '_csrf': csrf[-1],
            'secode': getMd5WithString(yzm)
        }

    def pair_urls_with_names(url_list, name_list):
        return zip(url_list, name_list)

    def company_found():
        return not (self.report.access_type == SeedAccessType.NON_COMPANY)

    module = Module(self.visitSearchList, u"搜索列表")
    module.appendUrl('http://211.141.74.198:8081/aiccips/pub/indsearch')
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module.appendCookie("cookie")
    module.appendOutput("url_list",
                        ".//*[@class='list']/ul/li/a/@href",
                        OutputType.LIST)
    module.appendOutput("name_list",
                        ".//*[@class='list']/ul/li/a/text()",
                        OutputType.LIST)
    module.appendOutput(name="search_list",
                        type=OutputType.FUNCTION,
                        function=pair_urls_with_names)
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="module_cookie"))
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=100,
              redo_module="module_cookie"))
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=company_found))
    module.addSleep(Sleep(3))
    self.module_manager.appendSubModule(module)
def initPostParam(self):
    """Configure the captcha-verification JSON module for gsxt.zjaic.gov.cn.

    The module wraps ``self.getJson`` and is configured with web method
    "post", sending the company keyword and captcha to
    ``doValidatorVerifyCode.do``.  The assertion passes only when the JSON
    response has ``flag == '1'``.  All three events name
    ``"module_validate_code"`` as their redo module.
    """

    def postSearchListData(json, yzm):
        # Currently unused: its only reference is the commented-out
        # appendOutput call below.
        if 'textfield' not in json:
            return None
        return {"textfield": json['textfield'], "code": yzm}

    def postDataJsonAssert(json=None):
        return bool(json) and 'flag' in json and json['flag'] == '1'

    def build_post_data(company_key, yzm):
        return {"name": company_key, "verifyCode": yzm}

    headers = {
        'Host': 'gsxt.zjaic.gov.cn',
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0",
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://gsxt.zjaic.gov.cn/search/doEnGeneralQueryPage.do',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache'
    }

    module = Module(self.getJson, "json中间结果")
    module.appendUrl(
        "http://gsxt.zjaic.gov.cn/search/doValidatorVerifyCode.do")
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    # module.appendOutput(name = "post_data", type = OutputType.FUNCTION, function = postSearchListData)
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              redo_module="module_validate_code"))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              redo_module="module_validate_code"))
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              assert_function=postDataJsonAssert,
              redo_module="module_validate_code"))
    self.module_manager.appendSubModule(module)
def initConfigSearchList(self):
    """Configure the search-list module for gsxt.gdgs.gov.cn.

    The module wraps ``self.visitSearchList`` and is configured with web
    method "post" against ``showInfo.html``; the post data is given by the
    name ``"post_data"``.  The ``search_list`` output collects every
    element with class ``list``.  The assertion fails when
    ``self.report.access_type`` is ``SeedAccessType.NON_COMPANY``.
    """
    headers = {
        'Host': 'gsxt.gdgs.gov.cn',
        'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://gsxt.gdgs.gov.cn/aiccips/',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0'
    }

    def company_found():
        return not (self.report.access_type == SeedAccessType.NON_COMPANY)

    module = Module(self.visitSearchList, "搜索列表")
    module.appendUrl(
        'http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/showInfo.html')
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendPostData("post_data")
    module.appendOutput("search_list", ".//*[@class='list']", OutputType.LIST)
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="module_validate_code"))
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=100,
              redo_module="module_validate_code"))
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=company_found))
    self.module_manager.appendSubModule(module)
def getCmpnySereachList(self):
    """Configure the company-list module (id ``"get_search_list"``).

    The module wraps ``self.visitSearchList`` and is configured with web
    method "post", sending the captcha and company keyword to
    ``searchList.jspx`` on gsxt.hljaic.gov.cn.  ``search_list`` is produced
    by zipping the ``url_list`` and ``name_list`` outputs.  The assertion
    fails when ``self.report.access_type`` is
    ``SeedAccessType.NON_COMPANY``.
    """
    headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/plain, */*; q=0.01",
        "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
    }

    def build_post_data(yzm, company_key):
        return {"checkNo": yzm, "entName": company_key}

    def pair_urls_with_names(url_list, name_list):
        return zip(url_list, name_list)

    def company_found():
        return not (self.report.access_type == SeedAccessType.NON_COMPANY)

    module = Module(self.visitSearchList, u"抓取公司列表")
    module.module_id = "get_search_list"
    module.appendUrl("http://gsxt.hljaic.gov.cn/searchList.jspx")
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendEncoding("utf-8")
    module.appendPostData(build_post_data)
    module.addSleep(Sleep(3))
    module.appendOutput("url_list",
                        ".//*[@class='list']/ul/li/a/@href",
                        OutputType.LIST)
    module.appendOutput("name_list",
                        ".//*[@class='list']/ul/li/a/text()",
                        OutputType.LIST)
    module.appendOutput(name="search_list",
                        type=OutputType.FUNCTION,
                        function=pair_urls_with_names)
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=company_found))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="init_validate_code"))
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=100,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(module)
def initPenaltyInfo(self, module_super):
    """Configure the administrative-penalty module for 211.141.74.198:8081.

    The module wraps ``self.visitXzcf`` and is configured with web method
    "post", sending ``encrpripid=params[0]`` to ``gsxzcfxx``; the last
    ``csrf`` entry is sent in the ``X-CSRF-TOKEN`` header.

    :param module_super: parent module that receives the new sub-module.
    """

    def build_url(params):
        # The URL is constant; ``params`` is accepted but not used.
        return 'http://211.141.74.198:8081/aiccips/pub/gsxzcfxx'

    def build_headers(csrf):
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': '211.141.74.198:8081',
            'X-CSRF-TOKEN': csrf[-1]
        }

    def build_post_data(params):
        return {'encrpripid': params[0]}

    module = Module(self.visitXzcf, u"获取行政处罚信息")
    module.appendUrl(build_url)
    module.appendHeaders(build_headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module_super.appendSubModule(module)
def checkValidateCode(self):
    """Verify the captcha produced by the previous module.

    Configures the module with id ``"check_validate_code"``: it wraps
    ``self.getJson`` and is configured with web method "post", sending
    ``checkNo=<yzm>`` to ``checkCheckNo.jspx`` on 218.95.241.36.  The
    assertion passes only when the response contains ``{success:true}``.

    :return: None
    """
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Proxy-Connection": "keep-alive",
        "Host": "218.95.241.36",
        "Origin": "http://218.95.241.36",
        "Referer": "http://218.95.241.36/search.jspx",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }

    def build_post_data(yzm):
        return {"checkNo": yzm}

    def checkValidatecode(json=None):
        if json and "{success:true}" in json:
            return True
        self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
        return False

    module = Module(self.getJson, u"校验验证码")
    module.module_id = "check_validate_code"
    module.appendUrl('http://218.95.241.36/checkCheckNo.jspx')
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendEncoding("utf-8")
    module.appendPostData(build_post_data)
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=checkValidatecode,
              redo_module="init_validate_code"))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(module, True)
def initXzcfxxInfo(self, module_super):
    """Configure the administrative-penalty module for 218.57.139.24.

    The module wraps ``self.visitXzcfJson`` and is configured with web
    method "post", sending ``encrpripid=com[2]`` to ``/pub/gsxzcfxx``.  The
    Referer header is built from ``com[1]`` and the CSRF token header from
    ``csrf[0]``.

    :param module_super: parent module that receives the new sub-module.
    """

    def build_headers(com, csrf, ua):
        return {
            "Host": "218.57.139.24",
            "User-Agent": ua,
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'http://218.57.139.24/pub/' + com[1],
            'X-CSRF-TOKEN': csrf[0],
            'X-Requested-With': 'XMLHttpRequest'
        }

    def build_post_data(com):
        return {'encrpripid': com[2]}

    module = Module(self.visitXzcfJson, u"获取行政处罚信息")
    module.appendUrl("http://218.57.139.24/pub/gsxzcfxx")
    module.appendHeaders(build_headers)
    module.appendWebMethod("post")
    module.addSleep(Sleep(2))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=5,
              redo_module='home_page'))
    module.appendPostData(build_post_data)
    module_super.appendSubModule(module, True)
def initQynbInfo(self, module_super):
    """Configure the two annual-report modules and attach them in order.

    The first module has no visit function; its function output stores the
    stripped ``qynb_year`` in ``self.value_dict['nb_name']`` when it is
    non-blank.  The second wraps ``self.visitQynb`` and is configured with
    web method "post" against ``http://gsxt.scaic.gov.cn/ztxy.do`` with
    ``method=ndbgDetail``, producing the ``nb_zch`` and ``nb_qym`` outputs.

    :param module_super: parent module that receives both sub-modules.
    """

    def saveNbyear(qynb_year):
        if not qynb_year:
            return
        stripped = qynb_year.strip()
        if stripped:
            self.value_dict['nb_name'] = stripped

    def build_headers(ua):
        return {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
            "Connection": "keep-alive"
        }

    def build_post_data(pripid, qynb_year):
        # 'random' carries the current time in milliseconds.
        return {
            'maent.nd': qynb_year.strip(),
            'maent.pripid': pripid,
            'method': 'ndbgDetail',
            'random': str(int(time.time() * 1000))
        }

    year_module = Module(None, u"设置年份")
    year_module.appendOutput(type=OutputType.FUNCTION, function=saveNbyear)
    module_super.appendSubModule(year_module, True)

    report_module = Module(self.visitQynb, u"获取企业年报")
    report_module.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
    report_module.appendHeaders(build_headers)
    report_module.appendWebMethod("post")
    report_module.appendPostData(build_post_data)
    report_module.appendOutput(
        "nb_zch",
        '//div[@id="qufenkuang"]/table[1]/tr[3]/td[1]/text()',
        OutputType.LIST)  # , show_up=OutputParameterShowUpType.OPTIONAL)
    report_module.appendOutput(
        "nb_qym",
        '//div[@id="qufenkuang"]/table[1]/tr[3]/td[2]/text()',
        OutputType.LIST)  # , show_up=OutputParameterShowUpType.OPTIONAL)
    # Rough check that the annual report carries a registration number and
    # company name (they may be missing when requests are sent too fast and
    # no data comes back).
    report_module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5))
    report_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    report_module.addSleep(Sleep(2))
    module_super.appendSubModule(report_module, True)
def getCompanyRecordInfo(self, module_super):
    """Configure the filing-information module (id ``"get_record_info"``).

    The module wraps ``self.visitBaxx`` and is configured with web method
    "post" against the ``entCheckInfo`` service on www.nmgs.gov.cn.  The
    post data takes ``entNo``, ``entType`` and ``regOrg`` from
    ``params_list[1]``, ``[2]`` and ``[3]``, each converted with ``str``.

    :param module_super: parent module that receives the new sub-module.
    """
    headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': "http://www.nmgs.gov.cn:7001/aiccips/GSpublicity/GSpublicityList.html?service=entInfo",
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }

    def build_post_data(params_list):
        return {
            "entNo": str(params_list[1]),
            "entType": str(params_list[2]),
            "regOrg": str(params_list[3])
        }

    module = Module(self.visitBaxx, u"抓取备案信息")
    module.module_id = "get_record_info"
    module.appendUrl(
        "http://www.nmgs.gov.cn:7001/aiccips/GSpublicity/GSpublicityList.html?service=entCheckInfo")
    module.appendHeaders(headers)
    module.appendEncoding("utf-8")
    module.addSleep(Sleep(3))
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module_super.appendSubModule(module, True)
def assertValidateCode(self):
    """Configure the captcha-check module (id ``"checkValidateCode"``).

    The module wraps ``self.getJson`` and is configured with web method
    "post", sending the company keyword and captcha to
    ``home_homeSearchYzm.pt`` on gsxt.jxaic.gov.cn.  The assertion passes
    only when the JSON response has ``msg == 'true'``.
    """

    def build_headers(ua):
        return {
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'User-Agent': ua,
            'Host': 'gsxt.jxaic.gov.cn',
            'Referer': 'http://gsxt.jxaic.gov.cn/ECPS/'
        }

    def build_post_data(yzm, company_key):
        return {"search": company_key, "yzm": yzm}

    def assertVaildCode(json=None):
        # A NON_COMPANY access type is rewritten to ERROR before the check.
        if self.report.access_type == SeedAccessType.NON_COMPANY:
            self.report.access_type = SeedAccessType.ERROR
        # Example response: {u'msg': u'true', u'success': True}
        return bool(json and (json.get("msg") == 'true'))

    module = Module(self.getJson, u"验证码")
    module.module_id = "checkValidateCode"
    module.appendUrl(
        'http://gsxt.jxaic.gov.cn/ECPS/home/home_homeSearchYzm.pt')
    module.appendHeaders(build_headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=assertVaildCode,
              redo_module="module_validate_code"))
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module.addSleep(Sleep(3))
    self.module_manager.appendSubModule(module, True)
def initAnnualReportPre(self, module_super):
    """Configure the annual-report year-list module (id ``"fetch_qynb_list"``).

    The module wraps ``self.getWebHtml`` and is configured with web method
    "post" against ``http://gsxt.scaic.gov.cn/ztxy.do`` with
    ``method=qygsInfo`` and ``czmk=czmk8``.  The optional
    ``qinb_param_list`` output holds the distinct four-digit years found in
    ``doNdbg('YYYY')`` onclick handlers.

    :param module_super: parent module that receives the new sub-module.
    """

    def build_headers(ua):
        return {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
            "Connection": "keep-alive"
        }

    def getQynbParamList(html):
        if not html:
            return []
        years = re.findall(r'\s+onclick="doNdbg\(\'(\d{4})\'\);"', html, re.S)
        # De-duplicate; the resulting order is whatever the set yields.
        return list(set(years))

    def build_post_data(pripid):
        # 'random' carries the current time in milliseconds.
        return {
            'czmk': 'czmk8',
            'maent.pripid': pripid,
            'method': 'qygsInfo',
            'random': str(int(time.time() * 1000))
        }

    module = Module(self.getWebHtml, u"获取年报年份列表")
    module.module_id = "fetch_qynb_list"
    module.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
    module.appendHeaders(build_headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module.appendOutput(name="qinb_param_list",
                        type=OutputType.FUNCTION,
                        function=getQynbParamList,
                        show_up=OutputParameterShowUpType.OPTIONAL)
    module.addSleep(Sleep(2))
    module_super.appendSubModule(module, True)
def initConfigSearchList(self):
    """Configure the search-list module for xyjg.egs.gov.cn.

    The module wraps ``self.visitSearchList`` and is configured with web
    method "post", sending the captcha and company keyword to
    ``searchList.jspx``.  The ``search_list`` output collects the link
    hrefs of the result list.  The assertion fails when
    ``self.report.access_type`` is ``SeedAccessType.NON_COMPANY``.
    """

    def build_headers(ua):
        return {
            "Host": "xyjg.egs.gov.cn",
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "http://xyjg.egs.gov.cn/ECPS_HB/search.jspx"
        }

    def build_post_data(company_key, yzm):
        return {'checkNo': yzm, 'entName': company_key}

    def company_found():
        return not (self.report.access_type == SeedAccessType.NON_COMPANY)

    module = Module(self.visitSearchList, u"搜索列表")
    module.appendUrl('http://xyjg.egs.gov.cn/ECPS_HB/searchList.jspx')
    module.appendHeaders(build_headers)
    module.appendWebMethod("post")
    module.appendPostData(build_post_data)
    module.appendOutput("search_list",
                        "//*[@class='list']/ul/li/a/@href",
                        OutputType.LIST)
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="module_home_page"))
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=100,
              redo_module="module_home_page"))
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=company_found))
    module.appendMiddleValueMonitor("search_list")
    self.module_manager.appendSubModule(module)
def checkValidateCode(self):
    """Configure the captcha-check module (id ``"check_validate_code"``).

    The module wraps ``self.getJson`` and is configured with web method
    "post", sending ``checkNo=<yzm>`` to ``checkCheckNo.jspx`` on
    gsxt.hljaic.gov.cn.  The assertion passes only when the response
    contains ``{success:true}``; otherwise a warning is logged.
    """
    headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
    }

    def build_post_data(yzm):
        return {"checkNo": yzm}

    def checkValidatecode(json=None):
        if json and "{success:true}" in json:
            return True
        self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
        return False

    module = Module(self.getJson, u"校验验证码")
    module.module_id = "check_validate_code"
    module.appendUrl('http://gsxt.hljaic.gov.cn/checkCheckNo.jspx')
    module.appendHeaders(headers)
    module.appendWebMethod("post")
    module.appendEncoding("utf-8")
    module.appendPostData(build_post_data)
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=checkValidatecode,
              redo_module="init_validate_code"))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(module, True)
# -*- coding: utf-8 -*-
def initConfigSearchList(self):
    """Register the "search list" module for gsxt.jxaic.gov.cn.

    The module POSTs ``company_key`` (as ``search``) and ``yzm`` to
    ``home_homeSearch.pt`` and declares these outputs:

    * ``url_list`` / ``name_list`` -- link targets and link texts of the
      result list, zipped together into ``search_list``;
    * ``name_invalid_list`` / ``status_invalid_list`` / ``page_nos`` --
      optional outputs.

    ``assert_func`` sets ``self.report.access_type`` and fails the assert
    when the page has no result links but has invalid-name entries, or when
    the page text contains the "no data" marker.
    """
    redo_target = "module_validate_code"
    optional = OutputParameterShowUpType.OPTIONAL

    request_headers = {
        'Host': 'gsxt.jxaic.gov.cn',
        'Referer': 'http://gsxt.jxaic.gov.cn/qyxxgsAction_queryXyxx.action',
        'Accept-Encoding': 'gzip, deflate',
        'Cache-Control': 'max-age=0',
        'Accept-Language': 'en-US,en;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }

    def buildPostData(yzm, company_key):
        return {
            "search": company_key,
            "yzm": yzm,
        }

    def pairUrlsWithNames(url_list, name_list):
        # Combine each result URL with the name found at the same position.
        return zip(url_list, name_list)

    module = Module(self.visitSearchList, "搜索列表")
    module.appendUrl('http://gsxt.jxaic.gov.cn/ECPS/home/home_homeSearch.pt')
    module.appendHeaders(request_headers)
    module.appendWebMethod("post")
    module.appendPostData(buildPostData)

    module.appendOutput(
        "url_list", './/*[@class="list"]/div/a/@href', OutputType.LIST)
    module.appendOutput(
        "name_list", './/*[@class="list"]/div/a/font/text()', OutputType.LIST)
    module.appendOutput(
        name="search_list",
        type=OutputType.FUNCTION,
        function=pairUrlsWithNames)
    module.appendOutput(
        "name_invalid_list",
        ".//*[@id='div0']/div[1]/text()",
        OutputType.LIST,
        show_up=optional)
    module.appendOutput(
        "status_invalid_list",
        ".//*[@id='div0']/div[2]/span[2]/text()",
        OutputType.LIST,
        show_up=optional)
    module.appendOutput(
        "page_nos",
        ".//*[@id='form1']//div//td[@align]/text()",
        show_up=optional)

    # TODO: paginate through the search list pages. ``page_range`` is not
    # wired into the module yet; it is kept for that future work.
    def page_range(page_nos):
        if not page_nos:
            return None
        # Drop the first three characters, then take the text between the
        # "total" marker and the "page" marker. The +3 offset presumably
        # skips one 3-byte UTF-8 character of a Python 2 byte string --
        # TODO confirm.
        trimmed = page_nos.strip()[3:]
        total_pages = trimmed[trimmed.find('共') + 3:trimmed.find('页')]
        return range(2, int(total_pages) + 1)

    # NOTE(review): EXCEPTION_OCCURED is registered twice (retry_times=2
    # here, retry_times=100 further down). Which one the framework honours
    # is not visible from this function -- confirm before removing either.
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=2,
              redo_module=redo_target))

    def assert_func(url_list, name_invalid_list, html):
        no_links_but_invalid_entries = not url_list and name_invalid_list
        if no_links_but_invalid_entries:
            self.report.access_type = SeedAccessType.NO_VALID_COMPANY
            self.holder.logging.info("无有效公司列表!")
            return False
        if '无数据' in html:
            self.report.access_type = SeedAccessType.NON_COMPANY
            self.holder.logging.info("无此公司!")
            return False
        return True

    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=assert_func))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module=redo_target))
    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=100,
              redo_module=redo_target))

    self.module_manager.appendSubModule(module)
# -*- coding: utf-8 -*-
def get_search_list(self):
    """Register the "search list" module for 218.57.139.24.

    The module POSTs ``company_key`` (``kw``), the first ``csrf`` entry
    (``_csrf``) and the hex MD5 digest of ``str(yzm)`` (``secode``) to
    ``/pub/indsearch``. Four ASSERT_FAILED events are registered, in this
    order: csrf present, ``yzm`` accepted, search limit not hit, and report
    not flagged NON_COMPANY. The raw result anchors go to
    ``search_list_xpath`` and are turned into tuples by ``getSearchList``.
    """
    redo_target = "home_page"

    def assertCsrf(csrf):
        # A usable csrf value is a non-empty list.
        return isinstance(csrf, list) and len(csrf) > 0

    def assertVaildCode(yzm, html=None):
        # An empty page, or a page carrying either error message, fails.
        if not html:
            return False
        return not (u'计算错误' in html or u'验证码超时,请重新计算' in html)

    def hexMd5(text):
        return hashlib.md5(text).hexdigest()

    def buildPostData(company_key, yzm, csrf):
        return {
            'kw': company_key,
            '_csrf': csrf[0],
            'secode': hexMd5(str(yzm))
        }

    def assertProxyStatus(html):
        if u"每天最多可搜索" not in html:
            return True
        # The page shows the search-limit message: switch to another proxy
        # and fail the assert.
        DownLoader('shandong').changeProxy()
        return False

    def assertCompanyExists():
        return not (self.report.access_type == SeedAccessType.NON_COMPANY)

    def getSearchList(search_list_xpath=None):
        """Build (name, href, encrpripid, com_num) tuples from anchors."""
        if not search_list_xpath:
            return []
        rows = []
        for anchor in search_list_xpath:
            names = anchor.xpath('./text()')
            hrefs = anchor.xpath('./@href')
            if not (names and hrefs):
                continue
            segments = hrefs[0].split('/')
            # Only hrefs that split into exactly three parts are accepted
            # (the original author wondered whether that count had changed).
            if len(segments) != 3:
                continue
            com_num = segments[1].strip()
            encrpripid = segments[2].strip()
            if com_num and encrpripid:
                rows.append((names[0].strip(), hrefs[0].strip(),
                             encrpripid, com_num))
        return rows

    module = Module(self.visitSearchList, u"搜索列表")
    module.appendUrl('http://218.57.139.24/pub/indsearch')
    module.appendHeaders(lambda ua: {
        "Host": "218.57.139.24",
        "User-Agent": ua,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        'Referer': 'http://218.57.139.24/'
    })
    module.appendWebMethod("post")

    # The order of the calls below matches the original registration order.
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=assertCsrf,
              redo_module=redo_target))
    module.appendPostData(buildPostData)
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=assertVaildCode,
              redo_module=redo_target))
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=20,
              assert_function=assertProxyStatus,
              redo_module=redo_target))
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=5,
              assert_function=assertCompanyExists))

    module.appendOutput(
        "search_list_xpath", '//ul/li[@class="font16"]/a', OutputType.LIST)
    module.appendOutput(
        name="search_list",
        type=OutputType.FUNCTION,
        function=getSearchList,
        show_up=OutputParameterShowUpType.OPTIONAL)

    module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
    module.addSleep(Sleep(2))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module=redo_target))

    self.module_manager.appendSubModule(module, True)
def initConfigSearchList(self):
    """Register the "search list" module for gsxt.scaic.gov.cn.

    The module (id ``get_search_list``) POSTs the GB2312-encoded company
    keyword and ``yzm`` to ``ztxy.do?method=list``. It declares the
    ``yzm_flag`` output (via ``self.getYzmFlag``), the raw result anchors
    (``search_list_xpath``) and ``tag_alist`` (via ``self.getSearchList``).
    Every event that names a ``redo_module`` uses ``check_validatecode``.

    The ``random`` query parameter is computed once, when this method runs,
    not per request.
    """
    redo_target = "check_validatecode"

    # Millisecond timestamp for the ``random`` parameter. The original
    # ``int(time.time()) * 1000`` truncated to whole seconds before scaling,
    # so the value always ended in "000".
    random_token = str(int(time.time() * 1000))

    def getPostData(yzm, company_key):
        """Build the form body with every text value encoded as GB2312."""
        if isinstance(company_key, unicode):
            company_key = company_key.encode('gb2312')
        else:
            # Byte string of unknown encoding: detect it, then transcode.
            # When detection yields nothing, the bytes are sent unchanged.
            charcode = chardet.detect(company_key).get('encoding')
            if charcode:
                company_key = company_key.decode(charcode).encode('gb2312')
        return {
            'currentPageNo': '1',
            'yzm': yzm,
            'maent.entname': company_key,
            "pName": u'请输入营业执照注册号或统一社会信用代码'.encode('gb2312')
        }

    def assertYzmFlag(yzm_flag=None):
        # Passes only when the flag equals 'yes'.
        return yzm_flag == 'yes'

    def assertNoCompany(yzm_flag=None):
        # Fails only when the flag is 'yes' and the report is NON_COMPANY.
        return not (yzm_flag == 'yes' and
                    self.report.access_type == SeedAccessType.NON_COMPANY)

    module = Module(self.visitSearchList, u"搜索列表")
    module.module_id = "get_search_list"
    module.appendUrl(
        "http://gsxt.scaic.gov.cn/ztxy.do?method=list&djjg=&random=" +
        random_token)
    module.appendWebMethod("post")
    module.appendHeaders(
        lambda ua: {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
            "Connection": "keep-alive"
        })
    module.appendPostData(getPostData)

    module.appendOutput(
        name="yzm_flag",
        type=OutputType.FUNCTION,
        function=self.getYzmFlag,
        show_up=OutputParameterShowUpType.OPTIONAL)

    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=assertYzmFlag,
              redo_module=redo_target))
    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=6,
              assert_function=assertNoCompany,
              redo_module=redo_target))

    # Collect the company search list (company name and onclick event
    # arguments). NOTE: this output is not marked OPTIONAL, unlike
    # ``tag_alist`` below.
    module.appendOutput(
        "search_list_xpath",
        './/ul/li[@class="font16"]/a',
        OutputType.LIST)
    module.appendOutput(
        name="tag_alist",
        type=OutputType.FUNCTION,
        function=self.getSearchList,
        show_up=OutputParameterShowUpType.OPTIONAL)

    module.addEvent(
        Event(EventType.OUTPUT_NOT_SATISFIED,
              retry_times=100,
              redo_module=redo_target))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module=redo_target))
    module.addSleep(Sleep(3))

    self.module_manager.appendSubModule(module, True)