def initAnnualReport(self, module_super):
    """Attach the annual-report iteration modules to ``module_super``.

    An iterator module (seeds ``nb_list``, loop parameter ``nb``) is appended
    to ``module_super``; a sub-module backed by ``self.visitQynb`` is then
    appended to that iterator module.

    :param module_super: parent module that receives the iterator module.
    :return: None
    """
    nb_iterator = Iterator(seeds="nb_list", param_name="nb")
    module = Module(None, u"遍历企业年报列表获取Url", nb_iterator)
    module_super.appendSubModule(module)

    sub_module = Module(self.visitQynb, u"获取企业年报详细信息")

    def annual_convert(nb):
        # ``nb`` is queried via XPath for its href attribute and its text.
        href = ''.join(nb.xpath("@href"))
        title = ''.join(nb.xpath("text()"))
        return {
            "nb_url": "http://aic.hainan.gov.cn:1888%s" % href,
            "nb_name": title.replace(u"年度报告", ""),
        }

    sub_module.appendInput(InputType.FUNCTION, input_value=annual_convert)
    sub_module.appendUrl("nb_url")
    # Headers are built per request so that the Referer carries company_url.
    sub_module.appendHeaders(
        lambda company_url: {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "aic.hainan.gov.cn:1888",
            "Referer": company_url,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
        }
    )
    sub_module.addSleep(Sleep(2))
    module.appendSubModule(sub_module)
def initNbiter(self, module_super):
    """Attach the annual-report iterator and its detail sub-module.

    The iterator module loops over ``qynb_list`` binding each entry to
    ``nianb``; the sub-module (handler ``self.visitQynb``) requests the
    entry's ``nb_url``.

    :param module_super: parent module that receives the iterator module.
    :return: None
    """
    module = Module(None, u"获取公司年报", Iterator("qynb_list", "nianb"))
    module_super.appendSubModule(module, True)

    sub_module = Module(self.visitQynb, u"获取年报详情")

    def prepare(nianb):
        # Each entry is indexed as [url, name].
        return {'nb_url': nianb[0], 'nb_name': nianb[1]}

    sub_module.appendInput(InputType.FUNCTION, input_value=prepare)
    sub_module.appendUrl('nb_url')
    sub_module.appendHeaders({
        'Host': 'www.sgs.gov.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    })
    module.appendSubModule(sub_module)
def initRouter(self, module_super):
    """Build the router module that dispatches by company-URL source.

    The input function classifies ``company_url`` into a source label, stores
    it in ``self.page_dict['source']`` and exposes it as ``source``. The first
    module of each of the three site-specific crawlers is appended as a
    sub-module, and an ASSERT_FAILED event rejects empty or Shenzhen sources.

    :param module_super: parent module that receives the router module.
    :return: None
    """
    module = Module(None, "广东公司适配", router=Router())

    def source_prepare(company_url):
        # Markers are checked in order; the first hit wins.
        markers = (
            ('gsxt.gzaic.gov.cn', u"企业信用网"),
            ('/GSpublicity/', u"企业信息网"),
            ('szcredit', u"深圳信用网"),
        )
        source = u"企业信用网"  # fallback when no marker matches
        for marker, label in markers:
            if marker in company_url:
                source = label
                break
        self.page_dict['source'] = source
        return {"source": source}

    module.appendInput(InputType.FUNCTION, source_prepare)

    # Instantiate each crawler and hook in its first module, in this order.
    for crawler_cls in (CrawlerGdQyxx, CrawlerGdQyxy, CrawlerSzxy):
        crawler = crawler_cls(self.pinyin, self)
        module.appendSubModule(crawler.module_manager.getFirstModule())

    def shenzhenAssert(source):
        if source and source != u"深圳信用网":
            return True
        # Empty or Shenzhen source: flag the seed as having no target source.
        self.report.access_type = SeedAccessType.NO_TARGET_SOURCE
        return False

    module.addEvent(Event(event_type=EventType.ASSERT_FAILED, retry_times=0, assert_function=shenzhenAssert))
    module_super.appendSubModule(module, True)
def initConfigBaseInfo(self, module_super):
    """Configure the basic-information module for gsxt.jxaic.gov.cn.

    The query string of ``company_url`` is split into a dict whose entries
    (``qyid``, ``zch``, ``qylx``) feed the request URL template.

    :param module_super: parent module that receives this module.
    :return: None
    """
    module = Module(self.visitJbxx, "基本信息")

    def prepare(company_url):
        raw_query = urlparse.urlparse(company_url).query
        query_ = {}
        for pair in raw_query.split("&"):
            pieces = pair.split("=")
            query_[pieces[0]] = pieces[1]
        print(query_)
        return query_

    module.appendInput(InputType.FUNCTION, prepare)
    module.appendUrl(
        lambda qyid, zch, qylx:
        "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxx.pt?qyid=%s&zch=%s&qylx=%s&num=undefined&showgdxx=true" % (qyid, zch, qylx))
    module.appendHeaders({
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
        'Host': 'gsxt.jxaic.gov.cn'
    })
    module_super.appendSubModule(module, True)
def initConfigBaseInfo(self, module_super):
    """Configure the basic-information module for gsxt.jxaic.gov.cn.

    The query string of ``company_url`` is split into a dict; ``zch`` is
    asserted to be non-empty and exported (stripped) as ``company_zch``.
    Both the failed assertion and any exception redo the module with id
    ``module_validate_code``, up to 5 times each.

    :param module_super: parent module that receives this module.
    :return: None
    """
    module = Module(self.visitJbxx, u"基本信息")

    def prepare(company_url):
        raw_query = urlparse.urlparse(company_url).query
        query_ = {}
        for pair in raw_query.split("&"):
            pieces = pair.split("=")
            query_[pieces[0]] = pieces[1]
        return query_

    module.appendInput(InputType.FUNCTION, prepare)

    def assertReqArgs(zch):
        # Assert that the request argument is usable (non-empty).
        return bool(zch)

    module.addEvent(
        Event(EventType.ASSERT_FAILED, retry_times=5, assert_function=assertReqArgs,
              redo_module="module_validate_code"))
    module.appendOutput(name='company_zch', type=OutputType.FUNCTION,
                        function=lambda zch: zch.strip(),
                        show_up=OutputParameterShowUpType.OPTIONAL)
    module.appendUrl(
        lambda qyid, zch, qylx:
        "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxx.pt?qyid=%s&zch=%s&qylx=%s&num=undefined&showgdxx=true" % (qyid, zch, qylx))
    # Headers are built per request: the User-Agent comes from ``ua`` and the
    # Referer is derived from the query parameters.
    module.appendHeaders(
        lambda ua, qylx, qyid, zch: {
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent': ua,
            'Referer': 'http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/ccjcgs_ccjcgsIndexDetail.pt?qylx=%s&qyid=%s&zch=%s&tabName=1' % (qylx, qyid, zch),
            'Host': 'gsxt.jxaic.gov.cn'
        })
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED, retry_times=5, redo_module="module_validate_code"))
    module_super.appendSubModule(module, True)
def getAnnalsInfo(self, module_super):
    """Iterate the annual-report list and fetch each report's detail page.

    An iterator module over ``annals_list`` (loop parameter ``annals``) is
    appended to ``module_super``; its sub-module ``get_annals_info`` requests
    the report URL on host 218.95.241.36 and redoes itself on exception.

    :param module_super: parent module that receives the iterator module.
    :return: None
    """
    module = Module(None, u"遍历年报列表", Iterator("annals_list", "annals"))
    module_super.appendSubModule(module, True)

    sub_module = Module(self.visitQynb, u"抓取年报详情")
    sub_module.module_id = "get_annals_info"

    def prepareParams(annals):
        # Entries shorter than [url, name] yield no parameters.
        if not annals or len(annals) < 2:
            return {}
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    sub_module.appendInput(InputType.FUNCTION, input_value=prepareParams)

    def getURL(nb_url=None):
        return u'http://218.95.241.36' + nb_url if nb_url else None

    sub_module.appendUrl(getURL)
    sub_module.appendHeaders({
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    })
    sub_module.appendEncoding("utf-8")
    sub_module.addSleep(Sleep(3))
    sub_module.addEvent(
        Event(EventType.EXCEPTION_OCCURED, retry_times=100, redo_module="get_annals_info"))
    module.appendSubModule(sub_module)
def getAnnalsInfo(self, module_super):
    """Iterate the annual-report list and fetch each report's detail page.

    An iterator module over ``annals_list`` (loop parameter ``annals``) is
    appended to ``module_super``; its sub-module ``get_annals_info`` requests
    the report URL on gsxt.hljaic.gov.cn and redoes itself on exception.

    :param module_super: parent module that receives the iterator module.
    :return: None
    """
    module = Module(None, u"遍历年报列表", Iterator("annals_list", "annals"))
    module_super.appendSubModule(module, True)

    sub_module = Module(self.visitQynb, u"抓取年报详情")
    sub_module.module_id = "get_annals_info"

    def prepareParams(annals):
        # Entries shorter than [url, name] yield no parameters.
        if not annals or len(annals) < 2:
            return {}
        return {'nb_url': annals[0], 'nb_name': annals[1]}

    sub_module.appendInput(InputType.FUNCTION, input_value=prepareParams)

    def getURL(nb_url=None):
        return u'http://gsxt.hljaic.gov.cn' + nb_url if nb_url else None

    sub_module.appendUrl(getURL)
    sub_module.appendHeaders({
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    })
    sub_module.appendEncoding("utf-8")
    sub_module.addSleep(Sleep(3))
    sub_module.addEvent(
        Event(EventType.EXCEPTION_OCCURED, retry_times=100, redo_module="get_annals_info"))
    module.appendSubModule(sub_module)
def initConfigShareHolderInfo(self, module_super):
    """Configure the shareholder-information module.

    The input function configures the module dynamically: when ``gdxx_text``
    is present it is used as the web content directly; otherwise a POST
    request to the ``gsczxx`` endpoint is set up with the CSRF token and the
    encrypted id taken from ``params``. An output function additionally
    extracts the ``bgsxliststr`` text from the page scripts as ``bgxx_text``.

    :param module_super: parent module that receives this module.
    :return: None
    """
    module = Module(self.visitGdxxJson, u"股东信息")

    # Dynamically add the module's inputs.
    def prepare(gdxx_text, csrf, params):
        if gdxx_text:
            # Inline data is available: no HTTP request needs configuring.
            module.appendWebContent("gdxx_text")
            return
        module.appendUrl("http://211.141.74.198:8081/aiccips/pub/gsczxx")
        module.appendHeaders({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': '211.141.74.198:8081',
            'X-CSRF-TOKEN': csrf[-1]
        })
        module.appendWebMethod("post")
        module.appendPostData({'encrpripid': params[0]})
        module.appendCookie("cookie")

    module.appendInput(InputType.FUNCTION, prepare)
    module.addMapper({
        'blicno': u'股东信息.证照或证件号码',
        'inv': u'股东信息.股东',
        'blictype': u'股东信息.证照或证件类型',
        'invtype': u'股东信息.股东类型',
        'primary_key': 'inv,blicno'
    })

    def parse4bgxx(script):
        if not script or not isinstance(script, list) or len(script) < 2:
            return None
        bgxx_text = None
        # Try script[1] first, then script[2] if it exists. The slice keeps
        # the fallback in bounds when the page has exactly two scripts
        # (indexing script[2] unconditionally raised IndexError there).
        for candidate in script[1:3]:
            bgxx_text = dataretrieve.regex_parse(
                {'regex': 'bgsxliststr =\'(.*)\''}, self.holder.logging, candidate)
            if bgxx_text:
                break
        return {"bgxx_text": bgxx_text}

    module.appendOutput(type=OutputType.FUNCTION, function=parse4bgxx)
    module_super.appendSubModule(module, True)
def initAnnalsDetails(self, module_super):
    """Configure the module that fetches one annual report's detail page.

    The input function turns an ``annal`` entry into ``annals_url`` and a
    digits-only ``nb_name``. Relative URLs are prefixed with the
    211.141.74.198:8081 host. On exception the module with id
    ``get_annals_list`` is redone.

    :param module_super: parent module that receives this module.
    :return: None
    """
    module = Module(self.visitQynb, u"获取年报详情")
    module.module_id = "get_annals_detail"

    def prepare(annal):
        # Entries shorter than [url, name] yield no parameters.
        if not annal or len(annal) < 2:
            return {}
        annals_url = str(annal[0])
        name = str(annal[1].strip('\r\n\t'))
        # Keep only the digit characters of the name.
        return {"annals_url": annals_url, "nb_name": filter(str.isdigit, name)}

    module.appendInput(InputType.FUNCTION, prepare)

    def getUrl(annals_url):
        if "http" not in annals_url:
            return u'http://211.141.74.198:8081/' + annals_url
        return annals_url

    module.appendUrl(getUrl)
    module.appendHeaders({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': '211.141.74.198:8081'
    })
    module.appendWebMethod("get")
    module.appendCookie("cookie")
    module.addSleep(Sleep(3))
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED, retry_times=100, redo_module="get_annals_list"))
    module_super.appendSubModule(module)
def initBasicInfo(self, module_super):
    """Configure the basic-information module for aic.hainan.gov.cn.

    Besides the request itself, four optional outputs are derived from the
    returned HTML: the page ranges (pages 2..N) of the shareholder, change,
    filing and branch sections. For each section a bypass is registered whose
    condition is true when its page range is empty.

    :param module_super: parent module that receives this module.
    :return: None
    """
    module = Module(self.visitJbxx, u"第四步_获取基本信息")

    def getparams(company_url):
        raw_query = urlparse.urlparse(company_url).query
        query = {}
        for pair in raw_query.split("&"):
            pieces = pair.split("=")
            query[pieces[0]] = pieces[1]
        print(query)
        return query

    module.appendInput(InputType.FUNCTION, getparams)
    module.appendUrl(lambda id: "http://aic.hainan.gov.cn:1888/businessPublicity.jspx?id=%s" % id)
    module.appendHeaders(
        {
            "Host": "aic.hainan.gov.cn:1888",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer": "http://aic.hainan.gov.cn:1888/searchList.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }
    )
    module.addSleep(Sleep(2))
    module_super.appendSubModule(module, True)

    # Shareholder-information page range.
    def getGdxxPageno(html):
        fenye_xpath = ".//div[@id='invDiv']/following-sibling::table[1]|.//div[@id='invDiv']/following-sibling::div[1]"
        matches = etree.HTML(html).xpath(fenye_xpath)
        if not matches:
            return []
        pageno = self.parse_pageno(matches[0])
        if pageno <= 1:
            return []
        self.holder.logging.info("------------------股东信息页码:" + str(pageno) + "---------------------")
        return range(2, int(pageno) + 1)

    module.appendOutput(name="gdxx_page_range", type=OutputType.FUNCTION, function=getGdxxPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    def bypass_fun_gdxx(gdxx_page_range=None):
        return not gdxx_page_range

    module.appendBypass(Bypass(condition_fuc=bypass_fun_gdxx, module_id="gdxx_pages"))

    # Change-information page range.
    def getBgxxPageno(html):
        fenye_xpath = ".//div[@id='altPagination']/table[1]|.//div[@id='altDiv']/following-sibling::table[1]"
        matches = etree.HTML(html).xpath(fenye_xpath)
        if not matches:
            return []
        pageno = self.parse_pageno(matches[0])
        if pageno <= 1:
            return []
        return range(2, pageno + 1)

    module.appendOutput(name="bgxx_page_range", type=OutputType.FUNCTION, function=getBgxxPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    def bypass_fun_bgxx(bgxx_page_range=None):
        return not bgxx_page_range

    module.appendBypass(Bypass(condition_fuc=bypass_fun_bgxx, module_id="bgxx_pages"))

    # Filing-information page range.
    def getbaxxPageno(html):
        fenye_xpath = ".//div[@id='memDiv']/following-sibling::table[1]|.//*[@id='beian']/table[2]"
        matches = etree.HTML(html).xpath(fenye_xpath)
        if not matches:
            return []
        pageno = self.parse_pageno(matches[0])
        if pageno <= 1:
            return []
        return range(2, pageno + 1)

    module.appendOutput(name="baxx_page_range", type=OutputType.FUNCTION, function=getbaxxPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    def bypass_fun_baxx(baxx_page_range=None):
        return not baxx_page_range

    module.appendBypass(Bypass(condition_fuc=bypass_fun_baxx, module_id="baxx_pages"))

    # Branch-office page range.
    def getfzjgPageno(html):
        fenye_xpath = ".//div[@id='childPagination']/table[1]|.//div[@id='childDiv']/following-sibling::table[1]"
        matches = etree.HTML(html).xpath(fenye_xpath)
        if not matches:
            return []
        pageno = self.parse_pageno(matches[0])
        if pageno <= 1:
            return []
        return range(2, pageno + 1)

    module.appendOutput(name="fzjg_page_range", type=OutputType.FUNCTION, function=getfzjgPageno,
                        show_up=OutputParameterShowUpType.OPTIONAL)

    def bypass_fun_fzjg(fzjg_page_range=None):
        return not fzjg_page_range

    module.appendBypass(Bypass(condition_fuc=bypass_fun_fzjg, module_id="fzjg_pages"))
def getAnnalsList(self, module_super):
    """Configure the module that fetches the annual-report list.

    The enterprise-publicity page on gsxt.hljaic.gov.cn is requested with the
    id taken from ``company_url``; the anchors under ``#qiyenianbao`` are
    exported as the optional output ``annals_list`` (a list of
    ``[url, name]`` pairs). On exception the module redoes itself.

    :param module_super: parent module that receives this module.
    :return: None
    """
    module = Module(self.getWebHtml, u"抓取年报列表")
    module.module_id = "get_annals_list"

    def prepareParams(company_url):
        query_dict = {}
        if company_url:
            # The id is whatever follows the first "=" in the URL.
            query_dict["url_id"] = company_url.split("=")[1]
        return query_dict

    module.appendInput(InputType.FUNCTION, prepareParams)

    def getURL(url_id=None):
        if url_id:
            return u'http://gsxt.hljaic.gov.cn/enterprisePublicity.jspx?id=' + url_id
        return None

    module.appendUrl(getURL)
    module.appendHeaders({
        "Host": "gsxt.hljaic.gov.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
    })
    module.appendEncoding("utf-8")
    module.addSleep(Sleep(3))

    def getAnnalsList(html=None):
        qynb_list = []
        try:
            tree = etree.HTML(html)
            _list = tree.xpath(".//*[@id='qiyenianbao']/table/tr/td/a")
            for ll in _list:
                url = ''.join(ll.xpath('@href')).strip()
                name = ''.join(ll.xpath('text()')).replace(u'年度报告', '')
                # Skip the generic "details" link.
                if name != u'详情':
                    qynb_list.append([url, name])
        except Exception as e:
            # Best effort: a parse failure yields an empty list, but it is
            # logged rather than swallowed, and SystemExit/KeyboardInterrupt
            # are no longer caught.
            # NOTE(review): assumes self.holder.logging is available here,
            # as in the sibling crawlers — confirm.
            self.holder.logging.warning(u"获取annals_list失败: %s" % e)
            qynb_list = []
        return qynb_list

    module.appendOutput(name='annals_list', type=OutputType.FUNCTION, function=getAnnalsList,
                        show_up=OutputParameterShowUpType.OPTIONAL)
    module.addEvent(
        Event(EventType.EXCEPTION_OCCURED, retry_times=100, redo_module="get_annals_list"))
    module_super.appendSubModule(module, True)
def initConfigBaseInfo(self, module_super):
    """Configure the basic-information module for host 211.141.74.198:8081.

    The input function unpacks a ``com`` entry into ``company_url`` and
    ``search_company``. The page's script texts are exported as ``script``
    and parsed into ``params`` and ``gdxx_text``. An assertion with no
    retries fails (after a one-hour sleep) when ``company`` rows exist but
    hold no non-blank value.

    :param module_super: parent module that receives this module.
    :return: None
    """
    module = Module(self.visitJbxx, u"基本信息")

    def prepare(com):
        # Entries shorter than [url, name] yield no parameters.
        if not com or len(com) < 2:
            return {}
        return {"company_url": com[0], "search_company": com[1]}

    module.appendInput(InputType.FUNCTION, prepare)

    def getUrl(company_url):
        if "http" not in company_url:
            return u'http://211.141.74.198:8081/aiccips/pub/' + company_url
        return company_url

    module.appendUrl(getUrl)
    module.appendHeaders({
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Host': '211.141.74.198:8081',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0'
    })
    module.appendOutput(name="script", xpath=".//script/text()", type=OutputType.LIST)

    def parse(script):
        if not script or not isinstance(script, list):
            return None
        url_params_conf = {
            'regex': ['encrpripid = \'(.*)\'', 'enttype=\'(.*)\'']
        }
        params = dataretrieve.regex_parse(url_params_conf, self.holder.logging, script[0])
        gdxx_text = dataretrieve.regex_parse(
            {'regex': "czxxliststr ='(.*)'"}, self.holder.logging, script[1])
        if gdxx_text:
            # Turn bare JSON nulls of these fields into the string "null".
            for field in ("inv", "blicno", "blictype", "invtype"):
                gdxx_text = gdxx_text.replace(
                    '"%s":null' % field, '"%s":"null"' % field)
        return {"params": params, "gdxx_text": gdxx_text}

    module.appendOutput(type=OutputType.FUNCTION, function=parse)

    # Case: the page has a table and a header but no actual content.
    def noContentAssert():
        if 'company' not in self.result_dict:
            return True
        rows = self.result_dict['company']
        for row in rows:
            for value in row.values():
                if value and value.strip():
                    return True
        if rows:
            time.sleep(3600)  # sleep for one hour
            return False
        return True

    module.addEvent(
        Event(EventType.ASSERT_FAILED, retry_times=0, assert_function=noContentAssert))
    module_super.appendSubModule(module, True)
# -*- coding: utf-8 -*-
def getAnnalsList(self, module_super):
    """Configure the module that fetches the annual-report list.

    The enterprise-publicity page on host 218.95.241.36 is requested with the
    id taken from ``company_url``; the anchors under ``#qiyenianbao`` are
    exported as the optional output ``annals_list`` (a list of
    ``[url, name]`` pairs).

    :param module_super: parent module that receives this module.
    :return: None
    """
    module = Module(self.getWebHtml, u"抓取年报列表")
    module.module_id = "get_annals_list"

    def prepareParams(company_url):
        if not company_url:
            return {}
        # The id is whatever follows the first "=" in the URL.
        return {"url_id": company_url.split("=")[1]}

    module.appendInput(InputType.FUNCTION, prepareParams)

    def getURL(url_id=None):
        return u'http://218.95.241.36/enterprisePublicity.jspx?id=' + url_id if url_id else None

    module.appendUrl(getURL)
    module.appendHeaders({
        "Host": "218.95.241.36",
        "Proxy-Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
        "Referer": "http://218.95.241.36/searchList.jspx",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8"
    })
    module.appendEncoding("utf-8")
    module.addSleep(Sleep(3))

    def getAnnalsList(html=None):
        try:
            reports = []
            anchors = etree.HTML(html).xpath(".//*[@id='qiyenianbao']/table/tr/td/a")
            for anchor in anchors:
                url = ''.join(anchor.xpath('@href')).strip()
                name = ''.join(anchor.xpath('text()')).replace(u'年度报告', '')
                # Skip the generic "details" link.
                if name == u'详情':
                    continue
                reports.append([url, name])
            return reports
        except Exception as e:
            # Best effort: log the failure and report an empty list.
            self.holder.logging.warning(u"获取annals_list失败: %s" % e)
            return []

    module.appendOutput(name='annals_list', type=OutputType.FUNCTION, function=getAnnalsList,
                        show_up=OutputParameterShowUpType.OPTIONAL)
    module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
    module_super.appendSubModule(module, True)