def initConfigValidateCode(self):
    """Configure the captcha-image request and register it as a sub-module.

    The module calls ``self.visitValidateCode`` and is retried (up to 100
    times) whenever the assertion callback reports an empty captcha value.
    """
    captcha = Module(self.visitValidateCode, u"获取验证码")
    captcha.module_id = "init_validate_code"

    # The random suffix is computed once, when this configuration runs.
    captcha_url = ("http://gsxt.hljaic.gov.cn/validateCode.jspx?type=0&id=" +
                   str(random.random()))
    captcha.appendUrl(captcha_url)

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
    }
    captcha.appendHeaders(request_headers)
    captcha.addSleep(Sleep(3))
    captcha.appendEncoding("utf-8")

    def checkValidatecode(yzm):
        # Assertion callback: a truthy captcha value passes; anything else
        # is logged and reported as a failure.
        if yzm:
            return True
        self.holder.logging.warning(u"获取验证码失败")
        return False

    retry_on_empty = Event(EventType.ASSERT_FAILED,
                           retry_times=100,
                           assert_function=checkValidatecode)
    captcha.addEvent(retry_on_empty)
    self.module_manager.appendSubModule(captcha, True)
def initToken(self):
    """Configure the module that fetches the page carrying ``session.token``.

    The module calls ``self.getWebHtml`` on the popup-captcha page and
    exposes the extracted value as the ``token`` output.  Both exception
    and unsatisfied-output events are retried up to 100 times.
    """
    module = Module(self.getWebHtml, u"令牌获取")
    module.module_id = "module_token"
    module.appendUrl('https://www.sgs.gov.cn/notice/search/popup_captcha')
    module.appendHeaders({
        'Host': 'www.sgs.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'https://www.sgs.gov.cn/notice/home'
    })
    module.appendEncoding("utf-8")

    def getToken(html):
        """Return the session token found in *html*, or None on failure."""
        # A single regex search replaces the separate substring test, so a
        # page that contains the marker but no closing quote no longer
        # raises AttributeError on ``None.group``.
        match = None
        if html:
            match = re.search(r'\"session\.token\": \"(.*?)\"', html)
        if match is None:
            self.holder.logging.error(u'获取session.token失败!')
            return None
        token = match.group(1)
        if not token:
            # Previously execution fell through and the empty token was
            # logged and returned as if extraction had succeeded.
            self.holder.logging.error(u'提取token失败!')
            return None
        self.holder.logging.info('token: %s' % token)
        return token

    module.appendOutput(name="token",
                        type=OutputType.FUNCTION,
                        function=getToken)
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
    self.module_manager.appendSubModule(module)
def initCookie(self):
    """Configure the module that derives the ``ROBOTCOOKIEID`` cookie.

    The module fetches the landing page with ``self.getWebHtml``; the page
    is expected to embed a packed JavaScript challenge.  ``getCookie``
    unpacks and runs that script in PyV8 and exposes the resulting cookie
    as the ``cookie`` output.
    """
    module = Module(self.getWebHtml, u"获取cookie")
    module.module_id = "module_cookie"
    module.appendUrl('http://211.141.74.198:8081/aiccips/')
    module.appendHeaders(
        lambda ua: {
            'Connection': 'keep-alive',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Cache-Control': 'max-age=0',
            'Referer': 'http://211.141.74.198:8081/aiccips/',
            'Host': '211.141.74.198:8081',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': ua
        })

    def getCookie(html):
        """Return ``{'ROBOTCOOKIEID': value}`` computed from the challenge page.

        Raises:
            Exception: when the packed-script arguments cannot be located,
                do not split into exactly six parts, or the script leaves
                no ``name=value`` cookie behind.
        """
        pattern = re.compile(r'\}\(([\s\S]*?\{\})\)\)', re.S)
        match = pattern.search(html) if html else None
        if match is None:
            # Previously surfaced as an opaque AttributeError/TypeError.
            raise Exception("cookie获取失败!")
        params = match.group(1).split(',')
        if len(params) != 6:
            raise Exception("cookie获取失败!")

        # Stub just enough of the browser environment (document, window,
        # location.reload) for the packed script, then re-run the packer's
        # own unpack routine with the six arguments lifted from the page.
        strstr = (
            'var document = {};var window = {};document[\'cookie\'] = "";'
            'window[\'location\'] ={};'
            ' window[\'location\'][\'reload\'] = function(){};'
            'eval(function (p, a, c, k, e, r) {'
            ' e = function(c) { return c.toString(a) };'
            ' if (!\'\'.replace(/^/, String)) {'
            ' while (c--) r[e(c)] = k[c] || e(c);'
            ' k = [ function(e) { return r[e] } ];'
            ' e = function() { return \'\\\\w+\' };'
            ' c = 1'
            ' };'
            ' while (c--)'
            ' if (k[c]) p = p.replace(new RegExp(\'\\\\b\' + e(c) + \'\\\\b\', \'g\'), k[c]);'
            ' return p'
            ' }(' + ','.join(params) + '));'
            ' challenge();var a = document[\'cookie\'];'
        )

        # NOTE(security): this evaluates JavaScript supplied by the remote
        # server.  It is confined to the PyV8 context, but the input is
        # untrusted -- review before widening what the context exposes.
        with PyV8.JSContext() as se:
            se.eval(strstr)
            a = se.locals.a
            if not a or '=' not in a:
                # Previously surfaced as an IndexError on the split below.
                raise Exception("cookie获取失败!")
            # Cookie text looks like "NAME=value; attr=..."; keep the value.
            cookie = a.split('=')[1].split(';')[0]
        return {'ROBOTCOOKIEID': cookie}

    module.appendOutput(name="cookie",
                        type=OutputType.FUNCTION,
                        function=getCookie)
    module.appendMiddleValueMonitor("cookie")
    module.addSleep(Sleep(3))
    self.module_manager.appendSubModule(module, True)
def initShareHolderDetail(self, module_super):
    """Attach the shareholder-detail iteration under *module_super*.

    An iterator module walks ``xh_pripid`` (binding each item to
    ``xh_prid``) and, for every item, runs a POST sub-module handled by
    ``self.visitGdxq``.

    :param module_super: parent module that receives the iterator module.
    """
    iterator = Iterator("xh_pripid", "xh_prid")
    module = Module(None, "进入股东详情", iterator)
    module.module_id = "fetch_gdxq_info"
    module_super.appendSubModule(module, True)

    sub_module = Module(self.visitGdxq, u"获取股东翻页信息")
    sub_module.appendUrl('http://gsxt.scaic.gov.cn/ztxy.do')
    sub_module.appendHeaders(
        lambda ua: {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
            "Connection": "keep-alive"
        })
    sub_module.appendWebMethod("post")
    # The dict literal used to list 'random' twice; only one entry can
    # survive in a dict, so the duplicate is dropped.  Index 1 of the
    # iterated item feeds pripid and index 0 feeds entbigtype.
    sub_module.appendPostData(
        lambda xh_prid: {
            'maent.pripid': xh_prid[1],
            'maent.entbigtype': xh_prid[0],
            'method': 'tzrCzxxDetial',
            'random': str(int(time.time() * 1000))
        })

    # Retry and sleep are configured on the iterator module, not on the
    # request sub-module.
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    module.addSleep(Sleep(2))
    module.appendSubModule(sub_module, True)
def initAnnualReportPre(self, module_super):
    """Attach the module that lists annual-report URLs under *module_super*.

    The page is fetched with ``self.getWebHtml``; links whose query string
    carries a four-digit ``nbnd`` value are exposed, de-duplicated and made
    absolute, as the optional ``qynb_param_list`` output.

    :param module_super: parent module that receives the new sub-module.
    """
    report_list = Module(self.getWebHtml, u"获取年报年份列表")
    report_list.module_id = "fetch_qynb_list"

    def build_url(qyid, company_zch, qylx):
        return "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/qygs_ViewQynb.pt?qyid=%s&zch=%s&qylx=%s&num=0" % (
            qyid, company_zch, qylx)

    def build_headers(ua, qylx, qyid, company_zch):
        # Only ``ua`` is used; the other parameters are part of the
        # original callback signature and are kept unchanged.
        return {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.jxaic.gov.cn",
            "Connection": "keep-alive"
        }

    report_list.appendUrl(build_url)
    report_list.appendHeaders(build_headers)

    def getRightUrl(html=None):
        # No page content means no report links.
        if not html:
            return []
        unique_hrefs = set(
            re.findall(r'<a\s+href="(.*?\?.*?nbnd=\d{4}.*?)"', html, re.S))
        return ['http://gsxt.jxaic.gov.cn' + href for href in unique_hrefs]

    report_list.appendOutput(name="qynb_param_list",
                             type=OutputType.FUNCTION,
                             function=getRightUrl,
                             show_up=OutputParameterShowUpType.OPTIONAL)
    module_super.appendSubModule(report_list, True)
def initArchiveInfo(self, module_super):
    """Attach the filing-information (``baInfo``) POST module to *module_super*.

    The response is handled by ``self.visitBaxx``; exceptions are retried
    up to five times.

    :param module_super: parent module that receives the new sub-module.
    """
    archive = Module(self.visitBaxx, u"获取备案信息")
    archive.module_id = "fetch_baxx_info"
    archive.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")

    def build_headers(ua):
        return {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
            "Connection": "keep-alive"
        }

    def build_post_data(pripid, entbigtype):
        # 'random' carries the current time in milliseconds.
        return {
            'czmk': 'czmk2',
            'maent.entbigtype': entbigtype,
            'maent.pripid': pripid,
            'method': 'baInfo',
            'random': str(int(time.time() * 1000))
        }

    archive.appendHeaders(build_headers)
    archive.appendWebMethod("post")
    archive.appendPostData(build_post_data)
    archive.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    archive.addSleep(Sleep(2))
    module_super.appendSubModule(archive, True)
def initConfigHomePage(self):
    """Configure the home-page request and register it as a sub-module.

    The request is handled by ``self.visitHomePage``, sends the ``cookie``
    value, and extracts the ``_csrf`` input value as the ``csrf`` output.
    """
    home = Module(self.visitHomePage, u"首页")
    home.module_id = "module_home_page"
    home.appendUrl("http://211.141.74.198:8081/aiccips/")

    request_headers = {
        'Connection': 'keep-alive',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Cache-Control': 'max-age=0',
        'Referer': 'http://211.141.74.198:8081/aiccips/',
        'Host': '211.141.74.198:8081',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0'
    }
    home.appendHeaders(request_headers)
    home.appendCookie("cookie")

    csrf_xpath = ".//input[@name='_csrf']/@value"
    home.appendOutput("csrf", csrf_xpath, OutputType.LIST)
    home.appendMiddleValueMonitor("csrf")
    home.addSleep(Sleep(3))
    self.module_manager.appendSubModule(home, True)
def initAnnualReportPre(self, module_super):
    """Attach the module that lists annual reports under *module_super*.

    Anchor elements are collected as the optional ``qynb_search_parms``
    output and then reduced by ``getQynbParms`` into ``(year, href)``
    tuples, exposed as the optional ``qinb_param_list`` output.

    :param module_super: parent module that receives the new sub-module.
    """
    report_list = Module(self.getWebHtml, u"获取年报年份列表")
    report_list.module_id = "fetch_qynb_list"
    report_list.appendUrl(
        lambda com: "http://218.57.139.24/pub/qygsdetail/%s/%s" % (com[3], com[2]))
    report_list.appendHeaders(lambda ua, com: {
        "User-Agent": ua,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Host": "218.57.139.24",
        "Referer": "http://218.57.139.24/pub/" + com[1],
        "Connection": "keep-alive"})
    report_list.appendOutput("qynb_search_parms", '//table/tr/td/a',
                             OutputType.LIST,
                             show_up=OutputParameterShowUpType.OPTIONAL)

    def getQynbParms(qynb_search_parms=None):
        # Nothing matched the anchor XPath: no reports to iterate.
        if not qynb_search_parms:
            return []
        year_href_pairs = []
        for anchor in qynb_search_parms:
            href = anchor.xpath('./@href')
            nb_name = anchor.xpath('./text()')
            if not (href and nb_name):
                continue
            # The first four-digit run in the link text is taken as the year.
            years = re.findall(r'.*?(\d{4}).+?', nb_name[0].strip(), re.S)
            if not years:
                continue
            year_href_pairs.append((years[0].strip(), href[0].strip()))
        return year_href_pairs

    report_list.appendOutput(name="qinb_param_list",
                             type=OutputType.FUNCTION,
                             function=getQynbParms,
                             show_up=OutputParameterShowUpType.OPTIONAL)
    report_list.addSleep(Sleep(2))
    module_super.appendSubModule(report_list, True)
def initConfigValidateCode(self):
    """Configure the captcha-image request and register it as a sub-module.

    The request is handled by ``self.visitValidateCode``; an empty captcha
    value fails the assertion and is retried up to 100 times.
    """
    captcha = Module(self.visitValidateCode, u"获取验证码")
    captcha.module_id = "init_validate_code"

    # The random suffix is computed once, when this configuration runs.
    captcha_url = ("http://www.nmgs.gov.cn:7001/aiccips/verify.html?random=" +
                   str(random.random()))
    captcha.appendUrl(captcha_url)

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Accept': 'image/webp,*/*;q=0.8',
        'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }
    captcha.appendHeaders(request_headers)
    captcha.addSleep(Sleep(3))
    captcha.appendEncoding("utf-8")

    def checkValidatecode(yzm):
        # Assertion callback: passes exactly when the captcha is truthy.
        return bool(yzm)

    retry_on_empty = Event(EventType.ASSERT_FAILED,
                           retry_times=100,
                           assert_function=checkValidatecode)
    captcha.addEvent(retry_on_empty)
    self.module_manager.appendSubModule(captcha, True)
def initConfigValidateCode(self):
    """Configure the captcha-image request and register it as a sub-module.

    The request is handled by ``self.visitValidateCode``.  A failed captcha
    assertion re-runs this same module (``redo_module`` is its own id) up
    to 100 times; exceptions are retried up to 100 times as well.
    """
    module = Module(self.visitValidateCode, u'获取验证码图片')
    module.module_id = "check_validatecode"
    # Both timestamps are computed once, when this configuration runs.
    module.appendUrl(
        "http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm&dt=" +
        str(int(time.time())) + "&random=" + str(int(time.time())))
    module.appendHeaders(
        lambda ua: {
            "Host": "gsxt.scaic.gov.cn",
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Connection": "keep-alive",
            'Cache-Control': 'max-age=0',
            # Header values must be strings; this was the integer 1, which
            # some HTTP clients reject outright.
            'Upgrade-Insecure-Requests': '1'
        })

    # Simple sanity assertion on the captcha value.
    def assertYzm(yzm=None):
        # Debug trace, written so it also parses on Python 3; the output
        # text is the same as the former print statement produced.
        print('Yzm ,  %s' % (yzm,))
        # Any integer (including 0) is accepted; otherwise the value must
        # be truthy.
        if isinstance(yzm, int):
            return True
        return bool(yzm)

    module.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=assertYzm,
              redo_module="check_validatecode"))
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module.addSleep(Sleep(2))
    self.module_manager.appendSubModule(module, True)
def getCmpnySereachList(self):
    """Configure the company-search POST and register it as a sub-module.

    Outputs: ``url_list`` and ``name_list`` (XPath lists) plus
    ``search_list``, the two zipped together.  Exceptions and unsatisfied
    outputs redo the ``init_validate_code`` module.
    """
    search = Module(self.visitSearchList, u"抓取公司列表")
    search.module_id = "get_search_list"
    search.appendUrl(
        "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/showInfo.html")

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }
    search.appendHeaders(request_headers)
    search.appendWebMethod("post")
    search.appendEncoding("utf-8")

    def build_post_data(yzm, textfield):
        # The raw string removes the literal two-character sequence
        # backslash + 'n', not newline characters.
        return {"code": yzm, "textfield": textfield.replace(r"\n", "")}

    search.appendPostData(build_post_data)
    search.addSleep(Sleep(3))

    link_xpath = ".//*[@class='list']/ul/li/a"
    search.appendOutput("url_list", link_xpath + "/@href", OutputType.LIST)
    search.appendOutput("name_list", link_xpath + "/text()", OutputType.LIST)
    search.appendOutput(
        name="search_list",
        type=OutputType.FUNCTION,
        function=lambda url_list, name_list: zip(url_list, name_list))

    # Assertion fails (no retries) when the seed is flagged NON_COMPANY.
    search.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=lambda: not (
                  self.report.access_type == SeedAccessType.NON_COMPANY)))
    for event_type in (EventType.EXCEPTION_OCCURED,
                       EventType.OUTPUT_NOT_SATISFIED):
        search.addEvent(
            Event(event_type,
                  retry_times=100,
                  redo_module="init_validate_code"))
    self.module_manager.appendSubModule(search)
def initAnnalsInfo(self, module_super):
    """Attach the annual-report iteration under *module_super*.

    An iterator module walks ``annals_list`` (binding each item to
    ``annal``); its per-item work is configured by
    ``self.initAnnalsDetails``.

    :param module_super: parent module that receives the iterator module.
    """
    annals_loop = Module(None, u"获取年报信息",
                         Iterator("annals_list", "annal"))
    annals_loop.module_id = "get_annals_info"
    module_super.appendSubModule(annals_loop, True)
    self.initAnnalsDetails(annals_loop)
def checkValidateCode(self):
    """Configure the captcha-verification POST and register it as a sub-module.

    The response is handled by ``self.getJson``.  On success the assertion
    callback stores the decoded ``textfield`` value in ``self.value_dict``;
    on failure ``init_validate_code`` is redone (up to 100 retries).
    """
    verify = Module(self.getJson, u"检验验证码")
    verify.module_id = "check_validate_code"
    verify.appendUrl(
        "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/checkCode.html")

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }
    verify.appendHeaders(request_headers)
    verify.appendWebMethod("post")
    verify.addSleep(Sleep(3))
    verify.appendEncoding("utf-8")
    verify.appendPostData(lambda yzm, company_key: {
        "code": yzm,
        "textfield": company_key
    })

    def checkValidatecode(web=None):
        if not web:
            return False
        # Collect every double-quoted token of the body; a valid reply has
        # exactly four, the third being the literal key 'textfield'.
        flags = re.compile(r'\"([\s\S]*?)\"').findall(str(web.body))
        malformed = len(flags) != 4 or flags[2] != 'textfield'
        # ``or`` short-circuits, so flags is only indexed once its length
        # has been confirmed.
        if malformed or (flags[0] == 'flag' and flags[1] != '1'):
            self.holder.logging.warning(u"验证码校验失败!")
            return False
        self.value_dict["textfield"] = flags[3].decode('raw_unicode_escape')
        return True

    verify.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=checkValidatecode,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(verify, True)
def getCmpnySereachList(self):
    """Configure the company-search POST and register it as a sub-module.

    Outputs: ``url_list``, ``name_list`` and ``search_list`` (the first two
    zipped together).  Exceptions and unsatisfied outputs redo the
    ``init_validate_code`` module.
    """
    search = Module(self.visitSearchList, u"抓取公司列表")
    search.module_id = "get_search_list"
    search.appendUrl("http://218.95.241.36/searchList.jspx")

    request_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Origin": "http://218.95.241.36",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://218.95.241.36/search.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }
    search.appendHeaders(request_headers)
    search.appendWebMethod("post")
    search.appendEncoding("utf-8")
    search.appendPostData(lambda yzm, company_key: {
        "checkNo": yzm,
        "entName": company_key
    })
    search.addSleep(Sleep(3))

    link_xpath = ".//*[@class='list']/ul/li/a"
    search.appendOutput("url_list", link_xpath + "/@href", OutputType.LIST)
    search.appendOutput("name_list", link_xpath + "/text()", OutputType.LIST)
    search.appendOutput(
        name="search_list",
        type=OutputType.FUNCTION,
        function=lambda url_list, name_list: zip(url_list, name_list))

    # Assertion fails (no retries) when the seed is flagged NON_COMPANY.
    search.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=lambda: not (
                  self.report.access_type == SeedAccessType.NON_COMPANY)))
    for event_type in (EventType.EXCEPTION_OCCURED,
                       EventType.OUTPUT_NOT_SATISFIED):
        search.addEvent(
            Event(event_type,
                  retry_times=100,
                  redo_module="init_validate_code"))
    self.module_manager.appendSubModule(search)
def initConfigValidateCode(self):
    """Configure the captcha-image request and register it as a sub-module.

    The request is handled by ``self.visitValidateCode``; exceptions are
    retried up to 100 times.
    """
    captcha = Module(self.visitValidateCode, "验证码")
    captcha.module_id = "module_validate_code"

    # The random suffix is computed once, when this configuration runs.
    captcha_url = ("http://gsxt.gdgs.gov.cn/aiccips/verify.html?random=" +
                   str(random.random()))
    captcha.appendUrl(captcha_url)

    request_headers = {
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0',
        'Host': 'gsxt.gdgs.gov.cn',
        'Referer': 'http://gsxt.gdgs.gov.cn/aiccips/'
    }
    captcha.appendHeaders(request_headers)
    captcha.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    self.module_manager.appendSubModule(captcha)
def getStockholderInfo(self, module_super):
    """Attach the module that fetches one page of shareholder data.

    The page is handled by ``self.visitGdxx``; the ``onclick`` attributes
    of the detail links are exposed as the optional ``gdxq_list`` output.

    :param module_super: parent module that receives the new sub-module.
    """
    holders = Module(self.visitGdxx, u"抓取股东信息")
    holders.module_id = "get_stockholder_info"

    def build_url(pno, company_id):
        return "http://218.95.241.36/QueryInvList.jspx?pno=%s&mainId=%s" % (
            pno, company_id)

    def build_headers(company_id):
        return {
            'Host': '218.95.241.36',
            'Proxy-Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'http://218.95.241.36/businessPublicity.jspx?id=' + str(company_id),
            'Accept-Encoding': 'gzip, deflate',
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

    holders.appendUrl(build_url)
    holders.appendHeaders(build_headers)
    holders.appendEncoding("utf-8")
    holders.addSleep(Sleep(3))

    def getGdxqList(html):
        # Best effort: any parsing failure is logged and yields an empty
        # dict rather than propagating.
        try:
            onclick_values = etree.HTML(html).xpath(
                ".//*[@class='detailsList']/tr/td/a/@onclick")
        except Exception as e:
            self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
            return dict()
        return {"gdxq_list": onclick_values}

    holders.appendOutput(type=OutputType.FUNCTION,
                         function=getGdxqList,
                         show_up=OutputParameterShowUpType.OPTIONAL)
    holders.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    module_super.appendSubModule(holders, True)
def fetchStockholderInfo(self, module_super):
    """Attach the shareholder page iteration under *module_super*.

    An iterator module walks ``gdxx_page_range`` (binding each item to
    ``pno``); its children are configured by ``self.getStockholderInfo``
    and then ``self.fetchStockholderDetail``.

    :param module_super: parent module that receives the iterator module.
    """
    page_loop = Module(
        iterator=Iterator(seeds="gdxx_page_range", param_name="pno"),
        name=u"遍历股东翻页")
    page_loop.module_id = "gdxx_pages"
    module_super.appendSubModule(page_loop)

    # Per-page fetch first, then the per-shareholder detail iteration.
    self.getStockholderInfo(page_loop)
    self.fetchStockholderDetail(page_loop)
def getAnnalsInfo(self, module_super):
    """Attach the annual-report iteration under *module_super*.

    An iterator module walks ``annals_list`` (binding each item to
    ``annals``); for each item a sub-module handled by ``self.visitQynb``
    fetches the report page.

    :param module_super: parent module that receives the iterator module.
    """
    annals_loop = Module(None, u"遍历年报列表",
                         Iterator("annals_list", "annals"))
    module_super.appendSubModule(annals_loop, True)

    detail = Module(self.visitQynb, u"抓取年报详情")
    detail.module_id = "get_annals_info"

    def prepareParams(annals):
        # Items with fewer than two entries contribute no values.
        if not (annals and len(annals) >= 2):
            return dict()
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    detail.appendInput(InputType.FUNCTION, input_value=prepareParams)

    def getURL(nb_url=None):
        # Prefix the report path with the site origin; None when absent.
        return (u'http://218.95.241.36' + nb_url) if nb_url else None

    detail.appendUrl(getURL)

    request_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }
    detail.appendHeaders(request_headers)
    detail.appendEncoding("utf-8")
    detail.addSleep(Sleep(3))
    # On exception the sub-module redoes itself (redo_module is its own id).
    detail.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="get_annals_info"))
    annals_loop.appendSubModule(detail)
def getStockholderInfo(self, module_super):
    """Attach the module that fetches one page of shareholder data.

    The page is handled by ``self.visitGdxx``; the ``onclick`` attributes
    of the detail links are exposed as the optional ``gdxq_list`` output.

    :param module_super: parent module that receives the new sub-module.
    """
    holders = Module(self.visitGdxx, u"抓取股东信息")
    holders.module_id = "get_stockholder_info"

    def build_url(pno, company_id):
        return "http://gsxt.hljaic.gov.cn/QueryInvList.jspx?pno=%s&mainId=%s" % (
            pno, company_id)

    def build_headers(company_id):
        return {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            'Referer': 'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' + str(company_id),
        }

    holders.appendUrl(build_url)
    holders.appendHeaders(build_headers)
    holders.appendEncoding("utf-8")
    holders.addSleep(Sleep(3))

    def getGdxqList(html):
        # Best effort: any parsing failure is logged and yields an empty
        # dict rather than propagating.
        try:
            onclick_values = etree.HTML(html).xpath(
                ".//*[@class='detailsList']/tr/td/a/@onclick")
        except Exception as e:
            self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
            return dict()
        return {"gdxq_list": onclick_values}

    holders.appendOutput(type=OutputType.FUNCTION,
                         function=getGdxqList,
                         show_up=OutputParameterShowUpType.OPTIONAL)
    holders.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    module_super.appendSubModule(holders, True)
def initHomePage(self):
    """Configure the home-page request and register it as a sub-module.

    The request is handled by ``self.visitHomePage`` and extracts the
    search form's ``_csrf`` value as the ``csrf`` output.  An unsatisfied
    output is retried up to 100 times, an exception up to 5 times.
    """
    home = Module(self.visitHomePage, u"访问首页")
    home.module_id = "home_page"
    home.appendUrl('http://218.57.139.24')

    def build_headers(ua):
        return {
            "Host": '218.57.139.24',
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Connection": "keep-alive",
            'User-Agent': ua
        }

    home.appendHeaders(build_headers)
    home.addSleep(Sleep(2))

    csrf_xpath = '//form[@id="searchform"]/input[@name="_csrf"]/@value'
    home.appendOutput("csrf", csrf_xpath, OutputType.LIST)
    home.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
    home.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    self.module_manager.appendSubModule(home, True)
def getCmpnySereachList(self):
    """Configure the company-search POST and register it as a sub-module.

    Outputs: ``url_list``, ``name_list`` and ``search_list`` (the first two
    zipped together).  Exceptions and unsatisfied outputs redo the
    ``init_validate_code`` module.
    """
    search = Module(self.visitSearchList, u"抓取公司列表")
    search.module_id = "get_search_list"
    search.appendUrl("http://gsxt.hljaic.gov.cn/searchList.jspx")

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/plain, */*; q=0.01",
        "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
    }
    search.appendHeaders(request_headers)
    search.appendWebMethod("post")
    search.appendEncoding("utf-8")
    search.appendPostData(lambda yzm, company_key: {
        "checkNo": yzm,
        "entName": company_key
    })
    search.addSleep(Sleep(3))

    link_xpath = ".//*[@class='list']/ul/li/a"
    search.appendOutput("url_list", link_xpath + "/@href", OutputType.LIST)
    search.appendOutput("name_list", link_xpath + "/text()", OutputType.LIST)
    search.appendOutput(
        name="search_list",
        type=OutputType.FUNCTION,
        function=lambda url_list, name_list: zip(url_list, name_list))

    # Assertion fails (no retries) when the seed is flagged NON_COMPANY.
    search.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=0,
              assert_function=lambda: not (
                  self.report.access_type == SeedAccessType.NON_COMPANY)))
    for event_type in (EventType.EXCEPTION_OCCURED,
                       EventType.OUTPUT_NOT_SATISFIED):
        search.addEvent(
            Event(event_type,
                  retry_times=100,
                  redo_module="init_validate_code"))
    self.module_manager.appendSubModule(search)
def getAnnalsInfo(self, module_super):
    """Attach the annual-report iteration under *module_super*.

    An iterator module walks ``annals_list`` (binding each item to
    ``annals``); for each item a sub-module handled by ``self.visitQynb``
    fetches the report page.

    :param module_super: parent module that receives the iterator module.
    """
    annals_loop = Module(None, u"遍历年报列表",
                         Iterator("annals_list", "annals"))
    module_super.appendSubModule(annals_loop, True)

    detail = Module(self.visitQynb, u"抓取年报详情")
    detail.module_id = "get_annals_info"

    def prepareParams(annals):
        # Items with fewer than two entries contribute no values.
        if not (annals and len(annals) >= 2):
            return dict()
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    detail.appendInput(InputType.FUNCTION, input_value=prepareParams)

    def getURL(nb_url=None):
        # Prefix the report path with the site origin; None when absent.
        return (u'http://gsxt.hljaic.gov.cn' + nb_url) if nb_url else None

    detail.appendUrl(getURL)

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    }
    detail.appendHeaders(request_headers)
    detail.appendEncoding("utf-8")
    detail.addSleep(Sleep(3))
    # On exception the sub-module redoes itself (redo_module is its own id).
    detail.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="get_annals_info"))
    annals_loop.appendSubModule(detail)
def initConfigCompanyInfoPre(self, module_super):
    """Attach the pre-processing module that unpacks a company tuple.

    The module issues no request (its handler is None); it maps the first
    three entries of ``tag_a`` to ``company_name``, ``entbigtype`` and
    ``pripid`` as optional outputs.

    :param module_super: parent module that receives the new sub-module.
    """
    prepare = Module(None, u"抓取公司前的预处理")
    prepare.module_id = "fetch_company_info"

    def setComParms(tag_a):
        company_name, entbigtype, pripid = tag_a[0], tag_a[1], tag_a[2]
        return {
            'company_name': company_name,
            'entbigtype': entbigtype,
            'pripid': pripid
        }

    prepare.appendOutput(type=OutputType.FUNCTION,
                         function=setComParms,
                         show_up=OutputParameterShowUpType.OPTIONAL)
    prepare.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module_super.appendSubModule(prepare, True)
def initConfigValidateCode(self):
    """Configure the captcha-image request and register it as a sub-module.

    The request is handled by ``self.visitValidateCode``; exceptions are
    retried up to 100 times.
    """
    captcha = Module(self.visitValidateCode, u"验证码")
    captcha.module_id = "module_validate_code"

    # The random suffix is computed once, when this configuration runs.
    captcha_url = (
        'http://gsxt.jxaic.gov.cn/ECPS/common/common_getJjYzmImg.pt?yzmName=searchYzm&imgWidth=180&t='
        + str(random.random()))
    captcha.appendUrl(captcha_url)

    request_headers = {
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Connection': 'keep-alive',
        'Accept': 'image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
        'Host': 'gsxt.jxaic.gov.cn',
        'Referer': 'http://gsxt.jxaic.gov.cn/'
    }
    captcha.appendHeaders(request_headers)
    captcha.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    self.module_manager.appendSubModule(captcha, True)
def checkValidateCode(self):
    """Configure the captcha-verification POST and register it as a sub-module.

    The response is handled by ``self.getJson``.  A failed assertion or an
    exception redoes ``init_validate_code`` (up to 100 retries each).
    """
    verify = Module(self.getJson, u"校验验证码")
    verify.module_id = "check_validate_code"
    verify.appendUrl('http://218.95.241.36/checkCheckNo.jspx')

    request_headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Proxy-Connection": "keep-alive",
        "Host": "218.95.241.36",
        "Origin": "http://218.95.241.36",
        "Referer": "http://218.95.241.36/search.jspx",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    verify.appendHeaders(request_headers)
    verify.appendWebMethod("post")
    verify.appendEncoding("utf-8")
    verify.appendPostData(lambda yzm: {"checkNo": yzm})

    def checkValidatecode(json=None):
        # Success is signalled by the literal marker in the response text.
        if json and "{success:true}" in json:
            return True
        self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
        return False

    verify.addEvent(
        Event(EventType.ASSERT_FAILED,
              retry_times=100,
              assert_function=checkValidatecode,
              redo_module="init_validate_code"))
    verify.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="init_validate_code"))
    self.module_manager.appendSubModule(verify, True)
def initConfigYzm(self):
    """Configure the captcha-image request and register it as a sub-module.

    The request is handled by ``self.visitValidateCode``; exceptions are
    retried up to 10000 times.
    """
    module = Module(self.visitValidateCode, u"验证码")
    module.module_id = "module_yzm"
    # ``int(random.random())`` is always 0 because random() lies in
    # [0.0, 1.0), so the cache-busting ``_`` parameter never varied.  The
    # raw float is used instead, as the other captcha URLs in this file
    # do.  ``radom_val`` is not used in the URL; it is kept because it is
    # part of the callback's existing signature.
    module.appendUrl(
        lambda radom_val: "http://xyjg.egs.gov.cn/ECPS_HB/validateCode.jspx?type=1&_=%s" %
        (str(random.random())))
    module.appendHeaders(
        lambda ua: {
            "Host": "xyjg.egs.gov.cn",
            "User-Agent": ua,
            "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://xyjg.egs.gov.cn/ECPS_HB/search.jspx"
        })
    # TODO: with no redo_module given, does the retry re-run this module
    # itself?
    module.addEvent(Event(EventType.EXCEPTION_OCCURED,
                          retry_times=10000))  # redo_module
    self.module_manager.appendSubModule(module)
def initAnnalsDetails(self, module_super):
    """Attach the annual-report detail request under *module_super*.

    Each ``annal`` item is turned into an ``annals_url`` and a digits-only
    ``nb_name``; the page is fetched with GET, sends the ``cookie`` value
    and is handled by ``self.visitQynb``.  Exceptions redo the
    ``get_annals_list`` module.

    :param module_super: parent module that receives the new sub-module.
    """
    detail = Module(self.visitQynb, u"获取年报详情")
    detail.module_id = "get_annals_detail"

    def prepare(annal):
        # Items with fewer than two entries contribute no values.
        if not (annal and len(annal) >= 2):
            return {}
        name = str(annal[1].strip('\r\n\t'))
        values = {}
        values["annals_url"] = str(annal[0])
        # Keep only the digit characters of the report name.
        values["nb_name"] = filter(str.isdigit, name)
        return values

    detail.appendInput(InputType.FUNCTION, prepare)

    def getUrl(annals_url):
        # URLs that already contain "http" are used unchanged; anything
        # else is prefixed with the site origin.
        if "http" not in annals_url:
            return u'http://211.141.74.198:8081/' + annals_url
        return annals_url

    detail.appendUrl(getUrl)

    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': '211.141.74.198:8081'
    }
    detail.appendHeaders(request_headers)
    detail.appendWebMethod("get")
    detail.appendCookie("cookie")
    detail.addSleep(Sleep(3))
    detail.addEvent(
        Event(EventType.EXCEPTION_OCCURED,
              retry_times=100,
              redo_module="get_annals_list"))
    module_super.appendSubModule(detail)
def initBranchInfoPage(self, module_super):
    """Attach the branch-office page iteration under *module_super*.

    An iterator module walks ``fzjg_page_range`` (binding each item to
    ``pno``); for each page a sub-module handled by ``self.visitFzjg``
    requests the branch list.

    :param module_super: parent module that receives the iterator module.
    """
    page_loop = Module(None, u"第八步_获取分支机构_开始翻页数据",
                       Iterator("fzjg_page_range", "pno"))
    page_loop.module_id = "fzjg_pages"
    module_super.appendSubModule(page_loop, True)

    page_fetch = Module(self.visitFzjg, u"第八步_获取分支机构翻页数据")
    page_fetch.appendUrl(
        lambda id, pno: "http://aic.hainan.gov.cn:1888/QueryChildList.jspx?mainId=%s&pno=%s" % (
            id, pno))

    def build_headers(company_url):
        return {
            'Host': "aic.hainan.gov.cn:1888",
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'Referer': "http://aic.hainan.gov.cn:1888" + company_url,
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }

    page_fetch.appendHeaders(build_headers)

    # The sleep is configured on the iterator module, not on the request.
    page_loop.addSleep(Sleep(2))
    page_loop.appendSubModule(page_fetch)
def initYzm(self):
    """Configure the captcha-image GET and register it as a sub-module.

    The request is handled by ``self.visitValidateCode``; exceptions are
    retried up to two times, and the module is registered with
    ``supportDefaultEvent=True``.
    """
    captcha = Module(self.visitValidateCode, u"第一步_获取验证码")
    captcha.module_id = "hn_yzm_pic"

    # The random id is computed once, when this configuration runs.
    captcha_url = ("http://aic.hainan.gov.cn:1888/validateCode.jspx?type=0&id=%s"
                   % random.random())
    captcha.appendUrl(captcha_url)

    request_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "aic.hainan.gov.cn:1888",
        "Referer": "http://aic.hainan.gov.cn:1888/search.jspx",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
    }
    captcha.appendHeaders(request_headers)
    captcha.appendWebMethod("get")
    captcha.addSleep(Sleep(2))
    captcha.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=2))
    self.module_manager.appendSubModule(captcha, supportDefaultEvent=True)
def initConfigValidateCode(self):
    """Configure the captcha-image request and register it as a sub-module.

    The request is handled by ``self.visitValidateCode``; exceptions are
    retried up to 100000 times.
    """
    captcha = Module(self.visitValidateCode, u"验证码")
    captcha.module_id = "module_validate_code"
    captcha.appendUrl(
        "http://gsxt.zjaic.gov.cn/common/captcha/doReadKaptcha.do")

    request_headers = {
        "Host": "gsxt.zjaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0",
        "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.zjaic.gov.cn/search/doEnGeneralQueryPage.do"
    }
    captcha.appendHeaders(request_headers)

    retry_on_error = Event(EventType.EXCEPTION_OCCURED, retry_times=100000)
    captcha.addEvent(retry_on_error)
    self.module_manager.appendSubModule(captcha)