def initShareHolderInfoPage(self, module_super):
    """
    Register the shareholder-list paging modules beneath ``module_super``.

    A container module built around ``Iterator("gdxx_pages", "page_no")`` is
    appended to ``module_super``; a fetch module backed by ``self.visitGdxx``
    is then attached to that container and configured to POST to the
    gsxt.jxaic.gov.cn shareholder endpoint.

    :param module_super: parent module that receives the paging module.
    :return: None
    """
    # NOTE(review): the callback parameter names (qyid, page_no, ua) are
    # presumably bound by name by the framework -- confirm before renaming.
    def buildUrl(qyid):
        return "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxGdxx.pt?qyid=%s" % qyid

    def buildPostData(page_no):
        # mark may be 0, -1 or 1; which one depends on the click order.
        return {
            'page': page_no,
            'limit': 5,
            'mark': 0
        }

    def buildHeaders(ua):
        return {
            'Host': 'gsxt.jxaic.gov.cn',
            'Connection': 'keep-alive',
            'User-Agent': ua,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

    pager = Module(None, u"进入股东翻页", Iterator("gdxx_pages", "page_no"))
    module_super.appendSubModule(pager, True)

    fetcher = Module(self.visitGdxx, u"获取股东翻页信息")
    fetcher.appendUrl(buildUrl)
    fetcher.appendWebMethod("post")
    fetcher.appendPostData(buildPostData)
    fetcher.appendHeaders(buildHeaders)
    pager.appendSubModule(fetcher, True)
def getAnnalsInfo(self, module_super):
    """
    Walk the annual-report list and fetch the detail page of each entry.

    A container module iterating ``annals_list`` (parameter ``annals``) is
    appended to ``module_super``; its child module calls ``self.visitQynb``
    against http://218.95.241.36 and is configured to redo itself on
    exceptions.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    annals_module_id = "get_annals_info"

    def prepareParams(annals):
        # Entries with fewer than two items contribute no parameters.
        if not annals or len(annals) < 2:
            return dict()
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    def getURL(nb_url=None):
        # No relative link means there is nothing to request.
        if not nb_url:
            return None
        return u'http://218.95.241.36' + nb_url

    request_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }

    walker = Module(None, u"遍历年报列表", Iterator("annals_list", "annals"))
    module_super.appendSubModule(walker, True)

    detail = Module(self.visitQynb, u"抓取年报详情")
    detail.module_id = annals_module_id
    detail.appendInput(InputType.FUNCTION, input_value=prepareParams)
    detail.appendUrl(getURL)
    detail.appendHeaders(request_headers)
    detail.appendEncoding("utf-8")
    detail.addSleep(Sleep(3))
    # On an exception the event points back at this same module id.
    retry_event = Event(EventType.EXCEPTION_OCCURED, retry_times=100,
                        redo_module=annals_module_id)
    detail.addEvent(retry_event)
    walker.appendSubModule(detail)
def getAnnalsInfo(self, module_super):
    """
    Walk the annual-report list and fetch the detail page of each entry.

    A container module iterating ``annals_list`` (parameter ``annals``) is
    appended to ``module_super``; its child module calls ``self.visitQynb``
    against http://gsxt.hljaic.gov.cn and is configured to redo itself on
    exceptions.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    annals_module_id = "get_annals_info"

    def prepareParams(annals):
        # Entries with fewer than two items contribute no parameters.
        if not annals or len(annals) < 2:
            return dict()
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    def getURL(nb_url=None):
        # No relative link means there is nothing to request.
        if not nb_url:
            return None
        return u'http://gsxt.hljaic.gov.cn' + nb_url

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    }

    walker = Module(None, u"遍历年报列表", Iterator("annals_list", "annals"))
    module_super.appendSubModule(walker, True)

    detail = Module(self.visitQynb, u"抓取年报详情")
    detail.module_id = annals_module_id
    detail.appendInput(InputType.FUNCTION, input_value=prepareParams)
    detail.appendUrl(getURL)
    detail.appendHeaders(request_headers)
    detail.appendEncoding("utf-8")
    detail.addSleep(Sleep(3))
    # On an exception the event points back at this same module id.
    retry_event = Event(EventType.EXCEPTION_OCCURED, retry_times=100,
                        redo_module=annals_module_id)
    detail.addEvent(retry_event)
    walker.appendSubModule(detail)
def initShareholderInfoDetail(self, module_super):
    """
    Register the modules that fetch one detail page per shareholder record.

    A container module iterating ``gdxx_list`` (parameter ``gdxx_rcd``) is
    appended to ``module_super``; its child module calls ``self.visitGdxq``
    with a URL derived from the record's ``onclick`` handler.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    def getGdxqUrl(gdxx_rcd):
        # Use the first field whose value mentions 'onclick'; the detail
        # link is the text between the first '(' and the first ')'.
        for field in gdxx_rcd:
            cell = gdxx_rcd[field]
            if 'onclick' not in cell:
                continue
            if isinstance(cell, basestring):
                # NOTE(review): eval() on a value that presumably comes from
                # a scraped page is unsafe -- consider ast.literal_eval once
                # the stored format is confirmed.
                cell = eval(cell)
            handler = cell["onclick"]
            open_at = handler.find('(') + 1
            close_at = handler.find(')')
            link = handler[open_at:close_at].replace("'", "")
            return "http://aic.hainan.gov.cn:1888" + link
        return None

    walker = Module(None, u"开始获取股东详情", Iterator("gdxx_list", "gdxx_rcd"))
    module_super.appendSubModule(walker, True)

    detail = Module(self.visitGdxq, u"获取股东详情信息")
    detail.appendUrl(getGdxqUrl)
    detail.addSleep(Sleep(2))
    walker.appendSubModule(detail)
def initGdxq(self, module_super):
    """
    Register the modules that visit every shareholder-detail link.

    A container module iterating ``gdxq_list`` (parameter ``gdxq``) is
    appended to ``module_super``; its child module calls ``self.visitGdxq``
    with ``"gdxq"`` registered as its URL.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    def buildHeaders(gdxq):
        # The current detail link doubles as the Referer.
        return {
            'Host': 'www.sgs.gov.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': gdxq
        }

    walker = Module(iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
                    name=u"遍历股东详情")
    module_super.appendSubModule(walker)

    detail = Module(self.visitGdxq, u"抓取股东详情")
    detail.appendUrl("gdxq")
    # NOTE(review): headers are attached to the iterating module rather than
    # to the fetch module -- confirm this is intentional.
    walker.appendHeaders(buildHeaders)
    walker.appendSubModule(detail)
def initShareHolderDetail(self, module_super):
    """
    Register the modules that fetch one detail page per shareholder record.

    A container module iterating ``gdxx_list`` (parameter ``gdxx_rcd``) is
    appended to ``module_super``; its child module calls ``self.visitGdxq``
    with the URL produced by ``self.getGdxqUrl`` and a fixed header set.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    request_headers = {
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
        'Host': 'gsxt.jxaic.gov.cn',
        'Cache-Control': 'max-age=0'
    }

    walker = Module(None, "进入股东详情", Iterator("gdxx_list", "gdxx_rcd"))
    module_super.appendSubModule(walker, True)

    detail = Module(self.visitGdxq, "获取股东详情信息")
    detail.appendUrl(self.getGdxqUrl)
    detail.appendHeaders(request_headers)
    walker.appendSubModule(detail)
def initGdxqInfoPrepare(self, module_super):
    """
    Register the modules that fetch one shareholder-detail page per record id.

    A container module iterating ``recid_list`` (parameter ``rid``) is
    appended to ``module_super``; its child module calls ``self.visitGdxq``
    against 218.57.139.24. The sleep and the exception event are attached
    to the container module.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    # NOTE(review): rid, com and ua are presumably bound by parameter name;
    # com is indexed at [1] and [2] below -- confirm its layout.
    def buildUrl(rid, com):
        return 'http://218.57.139.24/pub/gsnzczxxdetail/%s/%s'%(com[2], rid.strip())

    def buildHeaders(ua, com):
        return {
            "Host": "218.57.139.24",
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            'Referer':'http://218.57.139.24/pub/'+com[1],
        }

    walker = Module(None, u"进入股东详情", Iterator("recid_list", "rid"))
    module_super.appendSubModule(walker, True)

    detail = Module(self.visitGdxq, u"获取股东详情")
    detail.appendUrl(buildUrl)
    detail.appendHeaders(buildHeaders)

    walker.addSleep(Sleep(2))
    # On an exception the event names the 'home_page' module for redo.
    walker.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5, redo_module='home_page'))
    walker.appendSubModule(detail, True)
def initShareHolderDetail(self, module_super):
    """
    Register the modules that fetch one detail page per shareholder record.

    A container module iterating ``gdxx_list`` (parameter ``gdxx_rcd``) is
    appended to ``module_super``; its child module calls ``self.visitGdxq``
    with the URL produced by ``self.getGdxqUrl``. The exception event is
    attached to the container module.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    def buildHeaders(ua):
        # Only the User-Agent varies; it is supplied through ``ua``.
        return {
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': ua,
            'Host': 'gsxt.jxaic.gov.cn',
            'Cache-Control': 'max-age=0'
        }

    walker = Module(None, u"进入股东详情", Iterator("gdxx_list", "gdxx_rcd"))
    module_super.appendSubModule(walker, True)

    detail = Module(self.visitGdxq, u"获取股东详情信息")
    detail.appendUrl(self.getGdxqUrl)
    detail.appendHeaders(buildHeaders)

    walker.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
    walker.appendSubModule(detail, True)
def fetchStockholderDetail(self, module_super):
    """
    Walk the shareholder-detail list and fetch each detail page.

    A container module iterating ``gdxq_list`` (parameter ``gdxq``) is
    appended to ``module_super``; its child module calls ``self.visitGdxq``
    against http://gsxt.hljaic.gov.cn.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    def getURL(gdxq):
        # The relative link is whatever sits between "('" and "')".
        if not gdxq:
            return None
        found = re.search(r"(?<=\(').+?(?='\))", gdxq)
        if found is None:
            return None
        return u"http://gsxt.hljaic.gov.cn" + found.group(0)

    request_headers = {
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    }

    walker = Module(iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
                    name=u"遍历股东详情")
    module_super.appendSubModule(walker)

    detail = Module(self.visitGdxq, u"抓取股东详情")
    detail.appendUrl(getURL)
    detail.appendHeaders(request_headers)
    detail.appendEncoding("utf-8")
    detail.addSleep(Sleep(3))
    detail.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    walker.appendSubModule(detail)
def fetchCmpnyGdxq(self, module_super):
    """
    Walk the shareholder-detail list and fetch each detail page.

    A container module iterating ``gdxq_list`` (parameter ``gdxq``) is
    appended to ``module_super``; its child module calls ``self.visitGdxq``
    with the link extracted from each entry, used as-is (no host prefix
    is added).

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    # TODO: add try/except
    def getURL(gdxq):
        # The link is whatever sits between "('" and "')".
        if not gdxq:
            return None
        found = re.search(r"(?<=\(').+?(?='\))", gdxq)
        if found is None:
            return None
        return found.group(0)

    request_headers = {
        'Host': 'www.nmgs.gov.cn:7001',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'http://www.nmgs.gov.cn:7001',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
    }

    walker = Module(iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
                    name=u"遍历股东详情")
    module_super.appendSubModule(walker)

    detail = Module(self.visitGdxq, u"抓取股东详情")
    detail.appendUrl(getURL)
    detail.appendHeaders(request_headers)
    detail.appendEncoding("utf-8")
    detail.addSleep(Sleep(3))
    detail.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    walker.appendSubModule(detail)
def initBranchInfoPage(self, module_super):
    """
    Register the branch-office paging modules beneath ``module_super``.

    A container module with id ``fzjg_pages`` iterating ``fzjg_page_range``
    (parameter ``pno``) is appended to ``module_super``; its child module
    calls ``self.visitFzjg`` against the aic.hainan.gov.cn:1888
    ``QueryChildList.jspx`` endpoint. The sleep is attached to the container.

    :param module_super: parent module that receives the paging module.
    :return: None
    """
    # NOTE(review): ``id`` shadows the builtin, but callback parameter names
    # are presumably bound by name by the framework, so it is kept as-is.
    def buildUrl(id, pno):
        return "http://aic.hainan.gov.cn:1888/QueryChildList.jspx?mainId=%s&pno=%s" %(id, pno)

    def buildHeaders(company_url):
        return {
            'Host': "aic.hainan.gov.cn:1888",
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'Referer': "http://aic.hainan.gov.cn:1888" + company_url,
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }

    pager = Module(None, u"第八步_获取分支机构_开始翻页数据",
                   Iterator("fzjg_page_range", "pno"))
    pager.module_id = "fzjg_pages"
    module_super.appendSubModule(pager, True)

    fetcher = Module(self.visitFzjg, u"第八步_获取分支机构翻页数据")
    fetcher.appendUrl(buildUrl)
    fetcher.appendHeaders(buildHeaders)

    pager.addSleep(Sleep(2))
    pager.appendSubModule(fetcher)
def initRouter(self, module_super):
    """
    Build the router module that dispatches to the three Guangdong crawlers.

    The module derives a ``source`` label from ``company_url``, receives the
    first module of ``CrawlerGdQyxx``, ``CrawlerGdQyxy`` and ``CrawlerSzxy``
    as sub-modules, and carries an assertion event before being appended to
    ``module_super``.

    :param module_super: parent module that receives the router module.
    :return: None
    """
    router_module = Module(None, "广东公司适配", router=Router())

    def source_prepare(company_url):
        # Markers are tested in order and the first hit wins; a URL that
        # matches none of them gets the default label.
        known_sources = (
            ('gsxt.gzaic.gov.cn', u"企业信用网"),
            ('/GSpublicity/', u"企业信息网"),
            ('szcredit', u"深圳信用网"),
        )
        source = u"企业信用网"
        for marker, label in known_sources:
            if marker in company_url:
                source = label
                break
        self.page_dict['source'] = source
        return {"source": source}

    router_module.appendInput(InputType.FUNCTION, source_prepare)

    # Each crawler is built with (self.pinyin, self) and contributes the
    # first module of its own module manager.
    for crawler_cls in (CrawlerGdQyxx, CrawlerGdQyxy, CrawlerSzxy):
        crawler = crawler_cls(self.pinyin, self)
        router_module.appendSubModule(crawler.module_manager.getFirstModule())

    def shenzhenAssert(source):
        # Passes only for a non-empty source other than the Shenzhen one;
        # otherwise the report is marked NO_TARGET_SOURCE.
        if source and source != u"深圳信用网":
            return True
        self.report.access_type = SeedAccessType.NO_TARGET_SOURCE
        return False

    router_module.addEvent(Event(event_type=EventType.ASSERT_FAILED, retry_times=0,
                                 assert_function=shenzhenAssert))
    module_super.appendSubModule(router_module, True)
def initShareholderInfoPage(self, module_super):
    """
    Register the shareholder-list paging modules beneath ``module_super``.

    A container module with id ``gdxx_pages`` iterating ``gdxx_page_range``
    (parameter ``pno``) is appended to ``module_super``; its child module
    calls ``self.visitGdxx`` against the aic.hainan.gov.cn:1888
    ``QueryInvList.jspx`` endpoint. The sleep is attached to the container.

    :param module_super: parent module that receives the paging module.
    :return: None
    """
    # NOTE(review): ``id`` shadows the builtin, but callback parameter names
    # are presumably bound by name by the framework, so it is kept as-is.
    def buildUrl(id, pno):
        return "http://aic.hainan.gov.cn:1888/QueryInvList.jspx?mainId=%s&pno=%s" % (id, pno)

    def buildHeaders(company_url):
        return {
            'Host': "aic.hainan.gov.cn:1888",
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': "http://aic.hainan.gov.cn:1888" + company_url,
            'Accept-Encoding': 'gzip, deflate',
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

    pager = Module(None, u"第五步_获取股东信息_开始翻页数据",
                   Iterator("gdxx_page_range", "pno"))
    pager.module_id = "gdxx_pages"
    module_super.appendSubModule(pager, True)

    fetcher = Module(self.visitGdxx, u"第五步_获取股东信息翻页数据")
    fetcher.appendUrl(buildUrl)
    fetcher.appendHeaders(buildHeaders)

    pager.addSleep(Sleep(2))
    pager.appendSubModule(fetcher)
def fetchPunishInfo(self, module_super):
    """
    Walk the administrative-penalty pages and fetch each one.

    A container module iterating ``xzcf_page_range`` (parameter ``pno``) is
    appended to ``module_super``; its child module calls ``self.visitXzcf``
    against the 218.95.241.36 ``QueryPunList.jspx`` endpoint.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    def buildUrl(pno, company_id):
        # A fresh random value is appended as the ``ran`` query parameter
        # on every call.
        return "http://218.95.241.36/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s" % (
            pno, company_id, str(random.random()))

    def buildHeaders(company_id):
        return {
            'Host': '218.95.241.36',
            'Proxy-Connection': 'keep-alive',
            'Accept': '*/*',
            'Referer': 'http://218.95.241.36/businessPublicity.jspx?id=' + str(company_id),
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept-Encoding': 'gzip, deflate, sdch',
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

    walker = Module(iterator=Iterator(seeds="xzcf_page_range", param_name="pno"),
                    name=u"遍历行政处罚翻页")
    module_super.appendSubModule(walker)

    fetcher = Module(self.visitXzcf, u"抓取行政处罚信息")
    fetcher.appendUrl(buildUrl)
    fetcher.appendHeaders(buildHeaders)
    fetcher.appendEncoding("utf-8")
    fetcher.addSleep(Sleep(3))
    fetcher.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    walker.appendSubModule(fetcher)
def fetchStockholderDetail(self, module_super):
    """
    Walk the shareholder-detail list and fetch each detail page.

    A container module iterating ``gdxq_list`` (parameter ``gdxq``) is
    appended to ``module_super``; its child module calls ``self.visitGdxq``
    against http://218.95.241.36.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    def getURL(gdxq):
        # The relative link is whatever sits between "('" and "')".
        if not gdxq:
            return None
        found = re.search(r"(?<=\(').+?(?='\))", gdxq)
        if found is None:
            return None
        return "http://218.95.241.36" + found.group(0)

    request_headers = {
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }

    walker = Module(iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
                    name=u"遍历股东详情")
    module_super.appendSubModule(walker)

    detail = Module(self.visitGdxq, u"抓取股东详情")
    detail.appendUrl(getURL)
    detail.appendHeaders(request_headers)
    detail.appendEncoding("utf-8")
    detail.addSleep(Sleep(3))
    detail.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    walker.appendSubModule(detail)
def fetchPunishInfo(self, module_super):
    """
    Walk the administrative-penalty pages and fetch each one.

    A container module iterating ``xzcf_page_range`` (parameter ``pno``) is
    appended to ``module_super``; its child module calls ``self.visitXzcf``
    against the gsxt.hljaic.gov.cn ``QueryPunList.jspx`` endpoint.

    :param module_super: parent module that receives the iterating module.
    :return: None
    """
    def buildUrl(pno, company_id):
        # A fresh random value is appended as the ``ran`` query parameter
        # on every call.
        return "http://gsxt.hljaic.gov.cn/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s" % (
            pno, company_id, str(random.random()))

    def buildHeaders(company_id):
        return {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            'Referer': 'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' + str(company_id),
        }

    walker = Module(iterator=Iterator(seeds="xzcf_page_range", param_name="pno"),
                    name=u"遍历行政处罚翻页")
    module_super.appendSubModule(walker)

    fetcher = Module(self.visitXzcf, u"抓取行政处罚信息")
    fetcher.appendUrl(buildUrl)
    fetcher.appendHeaders(buildHeaders)
    fetcher.appendEncoding("utf-8")
    fetcher.addSleep(Sleep(3))
    fetcher.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
    walker.appendSubModule(fetcher)
# -*- coding: utf-8 -*-