def initNbiter(self, module_super):
    """Attach the annual-report iteration stage to ``module_super``.

    A container module iterates the ``qynb_list`` seeds (each entry bound
    to ``nianb``); its child module is built around ``self.visitQynb`` and
    is given the ``nb_url`` / ``nb_name`` values prepared from each entry.

    :param module_super: parent module that receives the new stage.
    """
    def prepare(nianb):
        # Entry layout used here: index 0 -> report URL, index 1 -> name.
        return {'nb_url': nianb[0], 'nb_name': nianb[1]}

    request_headers = {
        'Host': 'www.sgs.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    report_loop = Module(None, u"获取公司年报", Iterator("qynb_list", "nianb"))
    module_super.appendSubModule(report_loop, True)

    detail_step = Module(self.visitQynb, u"获取年报详情")
    detail_step.appendInput(InputType.FUNCTION, input_value=prepare)
    # 'nb_url' is presumably resolved from the prepared input - confirm
    # against the framework's appendUrl contract.
    detail_step.appendUrl('nb_url')
    detail_step.appendHeaders(request_headers)
    report_loop.appendSubModule(detail_step)
def initAnnualReport(self, module_super):
    """Attach the stage that walks the annual-report links.

    The container module iterates ``nb_list`` (entries bound to ``nb``);
    its child is built around ``self.visitQynb`` and receives the URL and
    name derived from each link node.

    :param module_super: parent module that receives the new stage.
    """
    def annual_convert(nb):
        # ``nb`` must support ``xpath``; the href and the link text are
        # each flattened into one string.
        href = ''.join(nb.xpath("@href"))
        label = ''.join(nb.xpath("text()"))
        return {
            "nb_url": "http://aic.hainan.gov.cn:1888%s" % href,
            "nb_name": label.replace(u"年度报告", ""),
        }

    def referer_headers(company_url):
        # The parameter name is kept verbatim; presumably the framework
        # binds context values by argument name - confirm before renaming.
        return {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "aic.hainan.gov.cn:1888",
            "Referer": company_url,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
        }

    link_loop = Module(None, u"遍历企业年报列表获取Url", Iterator(seeds="nb_list", param_name="nb"))
    module_super.appendSubModule(link_loop)

    # The detail module used to be built by a separate initAnnualReportInfo
    # method; it is now created inline below.
    detail_step = Module(self.visitQynb, u"获取企业年报详细信息")
    detail_step.appendInput(InputType.FUNCTION, input_value=annual_convert)
    detail_step.appendUrl("nb_url")
    detail_step.appendHeaders(referer_headers)
    detail_step.addSleep(Sleep(2))
    link_loop.appendSubModule(detail_step)
def initChangeInfoPage(self, module_super):
    """Attach the paging stage for change records.

    The container module iterates ``bgxx_pages`` (bound to ``page_no``);
    its child is built around ``self.visitBgxx`` and issues a POST per page.

    :param module_super: parent module that receives the new stage.
    """
    # Callback parameter names are kept verbatim; presumably the framework
    # binds context values by argument name - confirm before renaming.
    def change_url(qyid):
        return "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s" % qyid

    def page_form(page_no):
        return {
            'page': page_no,
            'limit': 5,
            'mark': 0
        }

    def request_headers(ua):
        return {
            'Host': 'gsxt.jxaic.gov.cn',
            'Connection': 'keep-alive',
            'User-Agent': ua,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

    page_loop = Module(None, u"进入变更信息翻页", Iterator("bgxx_pages", "page_no"))
    module_super.appendSubModule(page_loop)

    fetch_step = Module(self.visitBgxx, u"获取变更翻页信息")
    fetch_step.appendUrl(change_url)
    fetch_step.appendWebMethod("post")
    fetch_step.appendPostData(page_form)
    fetch_step.appendHeaders(request_headers)
    page_loop.appendSubModule(fetch_step, True)
def initConfigCompanyInfo(self):
    """Build the per-company stage and register it with the module manager.

    The stage iterates ``search_list`` (entries bound to ``info``) and is
    then populated by the preparation and router builders.
    """
    company_loop = Module(None, "获取公司信息", Iterator("search_list", "info"))
    self.module_manager.appendSubModule(company_loop, True)

    # Child stages are registered in this order.
    self.initCompanyPrepare(company_loop)
    self.initRouter(company_loop)
def initShareHolderDetail(self, module_super):
    """Attach the shareholder-detail stage to ``module_super``.

    A container module (id ``fetch_gdxq_info``) iterates the ``xh_pripid``
    seeds, binding each entry to ``xh_prid``; its child is built around
    ``self.visitGdxq`` and POSTs to the ``ztxy.do`` endpoint for each entry.

    :param module_super: parent module that receives the new stage.
    """
    iterator = Iterator("xh_pripid", "xh_prid")
    module = Module(None, "进入股东详情", iterator)
    module.module_id = "fetch_gdxq_info"
    module_super.appendSubModule(module, True)

    sub_module = Module(self.visitGdxq, u"获取股东翻页信息")
    sub_module.appendUrl('http://gsxt.scaic.gov.cn/ztxy.do')
    sub_module.appendHeaders(
        lambda ua: {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Origin": "http://gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
            "Connection": "keep-alive"
        })
    sub_module.appendWebMethod("post")
    # The original literal listed 'random' twice; a dict keeps only one
    # entry per key, so the duplicate (and its second clock read) is gone.
    # Index 1 of the seed feeds 'maent.pripid', index 0 'maent.entbigtype'.
    # NOTE(review): 'tzrCzxxDetial' is kept byte-for-byte; presumably it is
    # the method name the server expects - confirm before "fixing" it.
    sub_module.appendPostData(
        lambda xh_prid: {
            'maent.pripid': xh_prid[1],
            'maent.entbigtype': xh_prid[0],
            'random': str(int(time.time() * 1000)),
            'method': 'tzrCzxxDetial'
        })

    # Retry and sleep settings are registered on the container module.
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    module.addSleep(Sleep(2))
    module.appendSubModule(sub_module, True)
def initChangeInfoPage(self, module_super):
    """Attach the paging stage for change records (fixed User-Agent).

    The container module iterates ``bgxx_pages`` (bound to ``page_no``);
    its child is built around ``self.visitBgxx`` and issues a POST per page.

    :param module_super: parent module that receives the new stage.
    """
    # Callback parameter names are kept verbatim; presumably the framework
    # binds context values by argument name - confirm before renaming.
    def change_url(qyid):
        return "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s" % qyid

    def page_form(page_no):
        return {
            'page': page_no,
            'limit': 5,
            'mark': 0
        }

    static_headers = {
        'Host': 'gsxt.jxaic.gov.cn',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }

    page_loop = Module(None, "进入变更信息翻页", Iterator("bgxx_pages", "page_no"))
    module_super.appendSubModule(page_loop)

    fetch_step = Module(self.visitBgxx, "获取变更翻页信息")
    fetch_step.appendUrl(change_url)
    fetch_step.appendWebMethod("post")
    fetch_step.appendPostData(page_form)
    fetch_step.appendHeaders(static_headers)
    page_loop.appendSubModule(fetch_step)
def initAnnalsInfo(self, module_super):
    """Attach the annual-report iteration stage and fill in its details.

    The container module (id ``get_annals_info``) iterates ``annals_list``
    with each entry bound to ``annal``; ``self.initAnnalsDetails`` then
    adds the child stages.

    :param module_super: parent module that receives the new stage.
    """
    annals_loop = Module(None, u"获取年报信息", Iterator("annals_list", "annal"))
    annals_loop.module_id = "get_annals_info"
    module_super.appendSubModule(annals_loop, True)
    self.initAnnalsDetails(annals_loop)
def initConfigCompanyInfo(self):
    """Build the per-company stage and register it with the module manager.

    The stage iterates ``search_list`` (entries bound to ``info``); the
    builder methods below then add their child stages to it in order.
    """
    company_loop = Module(None, u"获取公司信息", Iterator("search_list", "info"))
    self.module_manager.appendSubModule(company_loop, True)

    # Preparation and base information.
    self.initCompanyInfoPrepare(company_loop)
    self.initConfigBaseInfo(company_loop)
    # Shareholder details.
    self.initGdxq(company_loop)
    # Annual reports: list stage first, then the per-report iteration.
    self.initNianBao(company_loop)
    self.initNbiter(company_loop)
    # Result collection is registered last.
    self.initResultCollect(company_loop)
def initConfigCompanyInfo(self):
    """Build the per-company stage and register it with the module manager.

    The stage iterates ``tag_alist`` (entries bound to ``tag_a``); the
    builder methods below then add their child stages to it in order.
    """
    company_loop = Module(None, u"获取公司信息", Iterator("tag_alist", "tag_a"))
    self.module_manager.appendSubModule(company_loop, True)

    # Preparation and base information.
    self.initConfigCompanyInfoPre(company_loop)
    self.initConfigBaseInfo(company_loop)
    # Archive, penalty and shareholder-detail stages.
    self.initArchiveInfo(company_loop)
    self.initPenaltyInfo(company_loop)
    self.initShareHolderDetail(company_loop)
    # Annual reports: preparation first, then the per-report iteration.
    self.initAnnualReportPre(company_loop)
    self.initAnnualReport(company_loop)
    # Result collection is registered last.
    self.initResultCollect(company_loop)
def initConfigCompanyInfo(self):
    """Build the per-company stage and register it with the module manager.

    The stage iterates ``search_list`` (entries bound to ``com``); the
    builder methods below then add their child stages to it in order.
    """
    company_loop = Module(None, u"获取公司信息", Iterator("search_list", "com"))
    self.module_manager.appendSubModule(company_loop, True)

    # Child stages are registered in exactly this order.
    self.initCompanyInfo(company_loop)
    self.initBaxxInfo(company_loop)
    self.initFzhgInfo(company_loop)
    self.initXzcfxxInfo(company_loop)
    self.initGdxqInfoPrepare(company_loop)
    # Annual reports: preparation first, then the per-report iteration.
    self.initAnnualReportPre(company_loop)
    self.initAnnualReport(company_loop)
    # Result collection is registered last.
    self.initResultCollect(company_loop)
def fetchStockholderInfo(self, module_super):
    """Iterate the shareholder pages, then the shareholder detail list.

    The container module (id ``gdxx_pages``) iterates ``gdxx_page_range``
    with each entry bound to ``pno``; the page stage and the detail stage
    are both added beneath it.

    :param module_super: parent module that receives the new stage.
    :return: None
    """
    page_loop = Module(
        iterator=Iterator(seeds="gdxx_page_range", param_name="pno"),
        name=u"遍历股东翻页",
    )
    page_loop.module_id = "gdxx_pages"
    module_super.appendSubModule(page_loop)

    self.getStockholderInfo(page_loop)
    self.fetchStockholderDetail(page_loop)
def getAnnalsInfo(self, module_super):
    """Iterate the annual-report list and fetch each report's detail page.

    The container module iterates ``annals_list`` (entries bound to
    ``annals``); its child (id ``get_annals_info``) is built around
    ``self.visitQynb``.

    :param module_super: parent module that receives the new stage.
    :return: None
    """
    def prepareParams(annals):
        # Entries with fewer than two items yield an empty mapping.
        if not annals or len(annals) < 2:
            return dict()
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    def getURL(nb_url=None):
        # A missing or empty relative path yields no URL at all.
        return u'http://218.95.241.36' + nb_url if nb_url else None

    static_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }

    annals_loop = Module(None, u"遍历年报列表", Iterator("annals_list", "annals"))
    module_super.appendSubModule(annals_loop, True)

    detail_step = Module(self.visitQynb, u"抓取年报详情")
    detail_step.module_id = "get_annals_info"
    detail_step.appendInput(InputType.FUNCTION, input_value=prepareParams)
    detail_step.appendUrl(getURL)
    detail_step.appendHeaders(static_headers)
    detail_step.appendEncoding("utf-8")
    detail_step.addSleep(Sleep(3))
    # The exception event names this same module id as its redo target.
    detail_step.addEvent(
        Event(EventType.EXCEPTION_OCCURED, retry_times=100, redo_module="get_annals_info"))
    annals_loop.appendSubModule(detail_step)
def fetchCompanyInfo(self):
    """Build the company-list stage and register it with the module manager.

    The stage iterates ``search_list`` (entries bound to ``com``); the
    builder methods below then add their child stages to it in order.
    """
    company_loop = Module(None, u"处理公司列表", Iterator("search_list", "com"))
    # Save the captcha image.
    company_loop.appendExtraFunction(self.yzmSave)
    self.module_manager.appendSubModule(company_loop, True)

    self.prepareCompnanyParms(company_loop)
    self.getCompanyInfo(company_loop)
    self.fetchCmpnyGdxq(company_loop)
    self.getCompanyRecordInfo(company_loop)
    self.getCompanyPunishInfo(company_loop)
    # The annual-report stages are currently disabled:
    # self.getAnnalsList(company_loop)
    # self.getAnnalsInfo(company_loop)
    self.initResultCollect(company_loop)
def initConfigCompanyInfo(self):
    """Build the per-company stage and register it with the module manager.

    The stage iterates ``search_list`` (entries bound to ``com``); the
    builder methods below then add their child stages to it in order.
    """
    company_loop = Module(None, u"获取公司信息", Iterator("search_list", "com"))
    # Save the captcha image.
    company_loop.appendExtraFunction(self.yzmSave)
    self.module_manager.appendSubModule(company_loop, True)

    # Base, shareholder and change information.
    self.initConfigBaseInfo(company_loop)
    self.initConfigShareHolderInfo(company_loop)
    self.initConfigChangeInfo(company_loop)
    # Archive, branch and penalty information.
    self.initArchiveInfo(company_loop)
    self.initBranchInfo(company_loop)
    self.initPenaltyInfo(company_loop)
    # Annual reports: list first, then the per-report iteration.
    self.initAnnalsList(company_loop)
    self.initAnnalsInfo(company_loop)
    # Result collection is registered last.
    self.initResultCollect(company_loop)
def getAnnalsInfo(self, module_super):
    """Iterate the annual-report list and fetch each report's detail page.

    The container module iterates ``annals_list`` (entries bound to
    ``annals``); its child (id ``get_annals_info``) is built around
    ``self.visitQynb``.

    :param module_super: parent module that receives the new stage.
    """
    def prepareParams(annals):
        # Entries with fewer than two items yield an empty mapping.
        if not annals or len(annals) < 2:
            return dict()
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    def getURL(nb_url=None):
        # A missing or empty relative path yields no URL at all.
        return u'http://gsxt.hljaic.gov.cn' + nb_url if nb_url else None

    static_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    }

    annals_loop = Module(None, u"遍历年报列表", Iterator("annals_list", "annals"))
    module_super.appendSubModule(annals_loop, True)

    detail_step = Module(self.visitQynb, u"抓取年报详情")
    detail_step.module_id = "get_annals_info"
    detail_step.appendInput(InputType.FUNCTION, input_value=prepareParams)
    detail_step.appendUrl(getURL)
    detail_step.appendHeaders(static_headers)
    detail_step.appendEncoding("utf-8")
    detail_step.addSleep(Sleep(3))
    # The exception event names this same module id as its redo target.
    detail_step.addEvent(
        Event(EventType.EXCEPTION_OCCURED, retry_times=100, redo_module="get_annals_info"))
    annals_loop.appendSubModule(detail_step)
def initShareholderInfoDetail(self, module_super):
    """Attach the shareholder-detail stage to ``module_super``.

    The container module iterates ``gdxx_list`` (entries bound to
    ``gdxx_rcd``); its child is built around ``self.visitGdxq`` and takes
    its URL from the ``onclick`` handler found in each record.

    :param module_super: parent module that receives the new stage.
    """
    iterator = Iterator("gdxx_list", "gdxx_rcd")
    module = Module(None, u"开始获取股东详情", iterator)
    module_super.appendSubModule(module, True)

    sub_module = Module(self.visitGdxq, u"获取股东详情信息")

    def getGdxqUrl(gdxx_rcd):
        """Return the detail URL held in the record's onclick cell, or None."""
        # An empty or missing record has nothing to scan.
        if not gdxx_rcd:
            return None
        for key in gdxx_rcd:
            cell = gdxx_rcd[key]
            # A None cell cannot hold the marker; skip it instead of letting
            # the membership test raise TypeError.
            if cell is None or 'onclick' not in cell:
                continue
            # SECURITY NOTE(review): eval() runs on text that originates
            # from the scraped site. Kept as-is because the stored format is
            # not visible here; consider ast.literal_eval once it is
            # confirmed to be a plain dict literal.
            onclick_dict = eval(cell) if isinstance(cell, basestring) else cell
            onclick = onclick_dict["onclick"]
            # The link is the text between the first '(' and the first ')',
            # with the single quotes removed.
            xq_link = onclick[onclick.find('(') + 1:onclick.find(')')].replace("'", "")
            return "http://aic.hainan.gov.cn:1888" + xq_link
        return None

    sub_module.appendUrl(getGdxqUrl)
    sub_module.addSleep(Sleep(2))
    module.appendSubModule(sub_module)
def initConfigCompanyInfo(self):
    """Build the per-company stage and register it with the module manager.

    The stage iterates ``search_list`` (entries bound to ``com``); the
    builder methods below then add their child stages to it in order.
    """
    company_loop = Module(None, "获取公司信息", Iterator("search_list", "com"))
    self.module_manager.appendSubModule(company_loop, True)

    # Preparation, base and top-level information.
    self.initCompanyInfoPrepare(company_loop)
    self.initConfigBaseInfo(company_loop)
    self.initTopInfo(company_loop)
    # Shareholders: first page, remaining pages, then details.
    self.initConfigShareHolderInfo(company_loop)
    self.initShareHolderInfoPage(company_loop)
    self.initShareHolderDetail(company_loop)
    # Change records: first page, then remaining pages.
    self.initConfigChangeInfo(company_loop)
    self.initChangeInfoPage(company_loop)
    # Archive information, then result collection last.
    self.initArchiveInfo(company_loop)
    self.initResultCollect(company_loop)
def initShareHolderDetail(self, module_super):
    """Attach the shareholder-detail stage (fixed User-Agent).

    The container module iterates ``gdxx_list`` (entries bound to
    ``gdxx_rcd``); its child is built around ``self.visitGdxq`` and takes
    its URL from ``self.getGdxqUrl``.

    :param module_super: parent module that receives the new stage.
    """
    static_headers = {
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
        'Host': 'gsxt.jxaic.gov.cn',
        'Cache-Control': 'max-age=0'
    }

    holder_loop = Module(None, "进入股东详情", Iterator("gdxx_list", "gdxx_rcd"))
    module_super.appendSubModule(holder_loop, True)

    detail_step = Module(self.visitGdxq, "获取股东详情信息")
    detail_step.appendUrl(self.getGdxqUrl)
    detail_step.appendHeaders(static_headers)
    holder_loop.appendSubModule(detail_step)
def initGdxq(self, module_super):
    """Attach the shareholder-detail stage to ``module_super``.

    The container module iterates ``gdxq_list`` (entries bound to
    ``gdxq``); its child is built around ``self.visitGdxq``.

    :param module_super: parent module that receives the new stage.
    """
    def referer_headers(gdxq):
        # The parameter name is kept verbatim; presumably the framework
        # binds context values by argument name - confirm before renaming.
        return {
            'Host': 'www.sgs.gov.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': gdxq
        }

    holder_loop = Module(
        iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
        name=u"遍历股东详情",
    )
    module_super.appendSubModule(holder_loop)

    detail_step = Module(self.visitGdxq, u"抓取股东详情")
    detail_step.appendUrl("gdxq")
    # NOTE(review): the headers are registered on the container module, not
    # on the request module built around self.visitGdxq. Behaviour is left
    # unchanged; confirm whether headers are inherited by child modules or
    # whether this should target the request module.
    holder_loop.appendHeaders(referer_headers)
    holder_loop.appendSubModule(detail_step)
def initGdxqInfoPrepare(self, module_super):
    """Attach the shareholder-detail stage to ``module_super``.

    The container module iterates ``recid_list`` (entries bound to
    ``rid``); its child is built around ``self.visitGdxq``.

    :param module_super: parent module that receives the new stage.
    """
    # Callback parameter names are kept verbatim; presumably the framework
    # binds context values by argument name - confirm before renaming.
    def detail_url(rid, com):
        # Path segments: index 2 of ``com``, then the stripped record id.
        return 'http://218.57.139.24/pub/gsnzczxxdetail/%s/%s'%(com[2], rid.strip())

    def request_headers(ua, com):
        return {
            "Host": "218.57.139.24",
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            'Referer':'http://218.57.139.24/pub/'+com[1],
        }

    record_loop = Module(None, u"进入股东详情", Iterator("recid_list", "rid"))
    module_super.appendSubModule(record_loop, True)

    detail_step = Module(self.visitGdxq, u"获取股东详情")
    detail_step.appendUrl(detail_url)
    detail_step.appendHeaders(request_headers)

    # Sleep and retry settings are registered on the container module; the
    # exception event names 'home_page' as its redo target.
    record_loop.addSleep(Sleep(2))
    record_loop.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5, redo_module='home_page'))
    record_loop.appendSubModule(detail_step, True)
def initShareHolderDetail(self, module_super):
    """Attach the shareholder-detail stage (User-Agent supplied per call).

    The container module iterates ``gdxx_list`` (entries bound to
    ``gdxx_rcd``); its child is built around ``self.visitGdxq`` and takes
    its URL from ``self.getGdxqUrl``.

    :param module_super: parent module that receives the new stage.
    """
    def request_headers(ua):
        # The parameter name is kept verbatim; presumably the framework
        # binds context values by argument name - confirm before renaming.
        return {
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': ua,
            'Host': 'gsxt.jxaic.gov.cn',
            'Cache-Control': 'max-age=0'
        }

    holder_loop = Module(None, u"进入股东详情", Iterator("gdxx_list", "gdxx_rcd"))
    module_super.appendSubModule(holder_loop, True)

    detail_step = Module(self.visitGdxq, u"获取股东详情信息")
    detail_step.appendUrl(self.getGdxqUrl)
    detail_step.appendHeaders(request_headers)

    # The retry event is registered on the container module.
    holder_loop.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    holder_loop.appendSubModule(detail_step, True)
def fetchCmpnyGdxq(self, module_super):
    """Attach the shareholder-detail stage to ``module_super``.

    The container module iterates ``gdxq_list`` (entries bound to
    ``gdxq``); its child is built around ``self.visitGdxq``.

    :param module_super: parent module that receives the new stage.
    """
    # TODO: add try/except handling.
    def getURL(gdxq):
        # Take the text between "('" and "')" in the entry; the first match
        # is used as the URL unchanged.
        if not gdxq:
            return None
        found = re.findall(r"(?<=\(').+?(?='\))", gdxq)
        if not found:
            return None
        return found[0]

    static_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }

    holder_loop = Module(
        iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
        name=u"遍历股东详情",
    )
    module_super.appendSubModule(holder_loop)

    detail_step = Module(self.visitGdxq, u"抓取股东详情")
    detail_step.appendUrl(getURL)
    detail_step.appendHeaders(static_headers)
    detail_step.appendEncoding("utf-8")
    detail_step.addSleep(Sleep(3))
    detail_step.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    holder_loop.appendSubModule(detail_step)
def fetchStockholderDetail(self, module_super):
    """Iterate the shareholder detail entries and fetch each detail page.

    The container module iterates ``gdxq_list`` (entries bound to
    ``gdxq``); its child is built around ``self.visitGdxq``.

    :param module_super: parent module that receives the new stage.
    :return: None
    """
    def getURL(gdxq):
        # Take the text between "('" and "')" in the entry and prefix the
        # site root to the first match.
        if not gdxq:
            return None
        found = re.findall(r"(?<=\(').+?(?='\))", gdxq)
        if not found:
            return None
        return u"http://gsxt.hljaic.gov.cn" + found[0]

    static_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    }

    holder_loop = Module(
        iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
        name=u"遍历股东详情",
    )
    module_super.appendSubModule(holder_loop)

    detail_step = Module(self.visitGdxq, u"抓取股东详情")
    detail_step.appendUrl(getURL)
    detail_step.appendHeaders(static_headers)
    detail_step.appendEncoding("utf-8")
    detail_step.addSleep(Sleep(3))
    detail_step.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    holder_loop.appendSubModule(detail_step)
def initBranchInfoPage(self, module_super):
    """Attach the paging stage for branch-office records.

    The container module (id ``fzjg_pages``) iterates ``fzjg_page_range``
    with each entry bound to ``pno``; its child is built around
    ``self.visitFzjg``.

    :param module_super: parent module that receives the new stage.
    """
    # Callback parameter names (including ``id``, which shadows the builtin)
    # are kept verbatim; presumably the framework binds context values by
    # argument name - confirm before renaming.
    def page_url(id, pno):
        return "http://aic.hainan.gov.cn:1888/QueryChildList.jspx?mainId=%s&pno=%s" %(id, pno)

    def request_headers(company_url):
        return {
            'Host': "aic.hainan.gov.cn:1888",
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'Referer': "http://aic.hainan.gov.cn:1888" + company_url,
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }

    page_loop = Module(None, u"第八步_获取分支机构_开始翻页数据", Iterator("fzjg_page_range", "pno"))
    page_loop.module_id = "fzjg_pages"
    module_super.appendSubModule(page_loop, True)

    fetch_step = Module(self.visitFzjg, u"第八步_获取分支机构翻页数据")
    fetch_step.appendUrl(page_url)
    fetch_step.appendHeaders(request_headers)

    # The sleep is registered on the container module.
    page_loop.addSleep(Sleep(2))
    page_loop.appendSubModule(fetch_step)
def fetchCompanyInfo(self):
    """Create the module branch that iterates the company list.

    The stage iterates ``search_list`` (entries bound to ``com``) and is
    registered with the module manager; the builder methods below then add
    their child stages to it in order.

    :return: None
    """
    company_loop = Module(None, u"处理公司列表", Iterator("search_list", "com"))
    # Save the captcha image.
    company_loop.appendExtraFunction(self.yzmSave)
    self.module_manager.appendSubModule(company_loop, True)

    # Parameter preparation and company information.
    self.prepareCompnanyParms(company_loop)
    self.getCompanyInfo(company_loop)
    # Shareholders: details, then the paged list.
    self.fetchStockholderDetail(company_loop)
    self.fetchStockholderInfo(company_loop)
    # Change, record, branch and penalty information.
    self.fetchChangeInfo(company_loop)
    self.fetchRecordInfo(company_loop)
    self.fetchBranchInfo(company_loop)
    self.fetchPunishInfo(company_loop)
    # Annual reports: list first, then the per-report iteration.
    self.getAnnalsList(company_loop)
    self.getAnnalsInfo(company_loop)
    # Result collection is registered last.
    self.initResultCollect(company_loop)
def initShareholderInfoPage(self, module_super):
    """Attach the paging stage for shareholder records.

    The container module (id ``gdxx_pages``) iterates ``gdxx_page_range``
    with each entry bound to ``pno``; its child is built around
    ``self.visitGdxx``.

    :param module_super: parent module that receives the new stage.
    """
    # Callback parameter names (including ``id``, which shadows the builtin)
    # are kept verbatim; presumably the framework binds context values by
    # argument name - confirm before renaming.
    def page_url(id, pno):
        return "http://aic.hainan.gov.cn:1888/QueryInvList.jspx?mainId=%s&pno=%s" % (id, pno)

    def request_headers(company_url):
        return {
            'Host': "aic.hainan.gov.cn:1888",
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': "http://aic.hainan.gov.cn:1888" + company_url,
            'Accept-Encoding': 'gzip, deflate',
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

    page_loop = Module(None, u"第五步_获取股东信息_开始翻页数据", Iterator("gdxx_page_range", "pno"))
    page_loop.module_id = "gdxx_pages"
    module_super.appendSubModule(page_loop, True)

    fetch_step = Module(self.visitGdxx, u"第五步_获取股东信息翻页数据")
    fetch_step.appendUrl(page_url)
    fetch_step.appendHeaders(request_headers)

    # The sleep is registered on the container module.
    page_loop.addSleep(Sleep(2))
    page_loop.appendSubModule(fetch_step)
def fetchPunishInfo(self, module_super):
    """Iterate the administrative-penalty pages and fetch each one.

    The container module iterates ``xzcf_page_range`` (entries bound to
    ``pno``); its child is built around ``self.visitXzcf``.

    :param module_super: parent module that receives the new stage.
    :return: None
    """
    # Callback parameter names are kept verbatim; presumably the framework
    # binds context values by argument name - confirm before renaming.
    def page_url(pno, company_id):
        # A fresh random value is appended as the ``ran`` query argument.
        return "http://218.95.241.36/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s" % (
            pno, company_id, str(random.random()))

    def request_headers(company_id):
        return {
            'Host': '218.95.241.36',
            'Proxy-Connection': 'keep-alive',
            'Accept': '*/*',
            'Referer': 'http://218.95.241.36/businessPublicity.jspx?id=' + str(company_id),
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept-Encoding': 'gzip, deflate, sdch',
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

    page_loop = Module(
        iterator=Iterator(seeds="xzcf_page_range", param_name="pno"),
        name=u"遍历行政处罚翻页",
    )
    module_super.appendSubModule(page_loop)

    fetch_step = Module(self.visitXzcf, u"抓取行政处罚信息")
    fetch_step.appendUrl(page_url)
    fetch_step.appendHeaders(request_headers)
    fetch_step.appendEncoding("utf-8")
    fetch_step.addSleep(Sleep(3))
    fetch_step.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    page_loop.appendSubModule(fetch_step)
def fetchStockholderDetail(self, module_super):
    """Iterate the shareholder detail entries and fetch each detail page.

    The container module iterates ``gdxq_list`` (entries bound to
    ``gdxq``); its child is built around ``self.visitGdxq``.

    :param module_super: parent module that receives the new stage.
    :return: None
    """
    def getURL(gdxq):
        # Take the text between "('" and "')" in the entry and prefix the
        # site root to the first match.
        if not gdxq:
            return None
        found = re.findall(r"(?<=\(').+?(?='\))", gdxq)
        if not found:
            return None
        return "http://218.95.241.36" + found[0]

    static_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }

    holder_loop = Module(
        iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
        name=u"遍历股东详情",
    )
    module_super.appendSubModule(holder_loop)

    detail_step = Module(self.visitGdxq, u"抓取股东详情")
    detail_step.appendUrl(getURL)
    detail_step.appendHeaders(static_headers)
    detail_step.appendEncoding("utf-8")
    detail_step.addSleep(Sleep(3))
    detail_step.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    holder_loop.appendSubModule(detail_step)
def fetchPunishInfo(self, module_super):
    """Iterate the administrative-penalty pages and fetch each one.

    The container module iterates ``xzcf_page_range`` (entries bound to
    ``pno``); its child is built around ``self.visitXzcf``.

    :param module_super: parent module that receives the new stage.
    :return: None
    """
    # Callback parameter names are kept verbatim; presumably the framework
    # binds context values by argument name - confirm before renaming.
    def page_url(pno, company_id):
        # A fresh random value is appended as the ``ran`` query argument.
        return "http://gsxt.hljaic.gov.cn/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s" % (
            pno, company_id, str(random.random()))

    def request_headers(company_id):
        return {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            'Referer': 'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' + str(company_id),
        }

    page_loop = Module(
        iterator=Iterator(seeds="xzcf_page_range", param_name="pno"),
        name=u"遍历行政处罚翻页",
    )
    module_super.appendSubModule(page_loop)

    fetch_step = Module(self.visitXzcf, u"抓取行政处罚信息")
    fetch_step.appendUrl(page_url)
    fetch_step.appendHeaders(request_headers)
    fetch_step.appendEncoding("utf-8")
    fetch_step.addSleep(Sleep(3))
    fetch_step.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    page_loop.appendSubModule(fetch_step)
def initCompanyInfo(self):
    """Build the per-company stage and register it with the module manager.

    The stage iterates ``search_list`` (entries bound to ``com``); the
    builder methods below then add their child stages to it in order.
    """
    company_loop = Module(None, u"获取公司信息", Iterator("search_list", "com"))
    self.module_manager.appendSubModule(company_loop, True)

    # URL parameters and basic information.
    self.initUrlParams(company_loop)
    self.initBasicInfo(company_loop)
    # Shareholders: paged list, then details.
    self.initShareholderInfoPage(company_loop)
    self.initShareholderInfoDetail(company_loop)
    # Change and archive pages.
    self.initChangeInfoPage(company_loop)
    self.initArchiveInfoPage(company_loop)
    # Currently disabled: self.initArchiveInfoTwoPage(company_loop)
    self.initBranchInfoPage(company_loop)
    # Annual reports: list first, then the per-report iteration.
    self.initAnnualReportList(company_loop)
    self.initAnnualReport(company_loop)
    # Currently disabled: self.initAnnualReportInfo(company_loop)
    # Result collection is registered last.
    self.initResultCollect(company_loop)