コード例 #1
0
ファイル: CrawlerShandong.py プロジェクト: chybot/crawler
 def initAnnualReportPre(self, module_super):
     """Build the "fetch_qynb_list" module and attach it to module_super.

     The module is bound to self.getWebHtml, requests the detail page whose
     URL is built from com[3] and com[2], and declares two optional outputs:
     the matched <a> nodes ("qynb_search_parms") and the (year, href) tuples
     derived from them ("qinb_param_list").
     """
     def getQynbParms(qynb_search_parms=None):
         """Return a list of (year, href) tuples taken from the anchor nodes."""
         if not qynb_search_parms:
             return []
         year_links = []
         for anchor in qynb_search_parms:
             hrefs = anchor.xpath('./@href')
             labels = anchor.xpath('./text()')
             if not (hrefs and labels):
                 continue
             # The pattern needs a 4-digit run followed by at least one
             # more character in the anchor text.
             years = re.findall(r'.*?(\d{4}).+?', labels[0].strip(), re.S)
             if years:
                 year_links.append((years[0].strip(), hrefs[0].strip()))
         return year_links

     module = Module(self.getWebHtml, u"获取年报年份列表")
     module.module_id = "fetch_qynb_list"
     module.appendUrl(
         lambda com: "http://218.57.139.24/pub/qygsdetail/%s/%s" % (com[3], com[2]))
     module.appendHeaders(
         lambda ua, com: {
             "User-Agent": ua,
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Host": "218.57.139.24",
             "Referer": "http://218.57.139.24/pub/" + com[1],
             "Connection": "keep-alive",
         })
     module.appendOutput(
         "qynb_search_parms",
         '//table/tr/td/a',
         OutputType.LIST,
         show_up=OutputParameterShowUpType.OPTIONAL)
     module.appendOutput(
         name="qinb_param_list",
         type=OutputType.FUNCTION,
         function=getQynbParms,
         show_up=OutputParameterShowUpType.OPTIONAL)
     module.addSleep(Sleep(2))
     module_super.appendSubModule(module, True)
コード例 #2
0
ファイル: CrawlerJilin.py プロジェクト: chybot/crawler
    def initCookie(self):
        """Build the "module_cookie" module and append it to self.module_manager.

        The module requests the site root through self.getWebHtml and derives
        the "cookie" output by running the page's packed JavaScript inside a
        PyV8 context.
        """
        module = Module(self.getWebHtml, u"获取cookie")
        module.module_id = "module_cookie"
        module.appendUrl('http://211.141.74.198:8081/aiccips/')
        module.appendHeaders(
            lambda ua: {
                'Connection': 'keep-alive',
                'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Cache-Control': 'max-age=0',
                'Referer': 'http://211.141.74.198:8081/aiccips/',
                'Host': '211.141.74.198:8081',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'User-Agent': ua
            })

        def getCookie(html):
            """Return {'ROBOTCOOKIEID': value} computed from the script in html.

            Raises Exception when the packed-script arguments cannot be found
            in html or do not split into exactly six comma-separated parts.
            """
            pattern = re.compile(r'\}\(([\s\S]*?\{\})\)\)', re.S)
            match = pattern.search(html) if html else None
            if match is None:
                # No packed script in the response: fail with the same error
                # as a malformed one rather than AttributeError on None.
                raise Exception("cookie获取失败!")
            params = match.group(1).split(',')
            if len(params) != 6:
                raise Exception("cookie获取失败!")
            # NOTE(review): params come straight from the remote response and
            # are evaluated as JavaScript below - confirm the host is trusted.
            strstr = 'var document = {};var window = {};document[\'cookie\'] = "";window[\'location\'] ={}; \
                window[\'location\'][\'reload\'] = function(){};eval(function (p, a, c, k, e, r) { \
            	e = function(c) { \
            		return c.toString(a) \
            	}; \
            	if (!\'\'.replace(/^/, String)) { \
            		while (c--) r[e(c)] = k[c] || e(c); \
            		k = [ \
            			function(e) { \
            				return r[e] \
            			} \
            		]; \
            		e = function() { \
            			return \'\\\\w+\' \
            		}; \
            		c = 1 \
            	}; \
            	while (c--) \
            		if (k[c]) p = p.replace(new RegExp(\'\\\\b\' + e(c) + \'\\\\b\', \'g\'), k[c]); \
            	return p \
            }(' + params[0] + ',' + params[1] + ',' + params[2] + ',' + params[
                3] + ',' + params[4] + ',' + params[5] + ')); \
                challenge();var a = document[\'cookie\'];'

            with PyV8.JSContext() as se:
                se.eval(strstr)
                a = se.locals.a
                # Take the value between the first "=" and the following ";".
                cookie = a.split('=')[1].split(';')[0]
                return {'ROBOTCOOKIEID': cookie}

        module.appendOutput(name="cookie",
                            type=OutputType.FUNCTION,
                            function=getCookie)
        module.appendMiddleValueMonitor("cookie")
        module.addSleep(Sleep(3))
        self.module_manager.appendSubModule(module, True)
コード例 #3
0
ファイル: CrawlerShanghai.py プロジェクト: chybot/crawler
 def initSearchPage(self, module_super):
     """Attach the paged search-list module to module_super.

     The module is bound to self.visitSearchList, POSTs the search form
     (token, keyword and page number) to ent_info_list, declares the
     "search_list" LIST output and registers an EXCEPTION_OCCURED event
     with retry_times=2.
     """
     request_headers = {
         'Host': 'www.sgs.gov.cn',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
         'Accept-Encoding': 'gzip, deflate',
         'Referer': 'https://www.sgs.gov.cn/notice/home',
         'Content-Type': 'application/x-www-form-urlencoded',
     }

     module = Module(self.visitSearchList, u"搜索列表-翻页")
     module.appendUrl('https://www.sgs.gov.cn/notice/search/ent_info_list')
     module.appendHeaders(request_headers)
     module.appendWebMethod("post")
     module.appendPostData(lambda token, company_key, page_no: {
         'searchType': '1',
         'captcha': 0,
         'session.token': token,
         'condition.keyword': company_key,
         'condition.pageNo': page_no,
     })
     module.appendOutput(
         "search_list", './/div[@class="list-item"]', OutputType.LIST)
     module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=2))
     module_super.appendSubModule(module)
コード例 #4
0
ファイル: CrawlerShanghai.py プロジェクト: chybot/crawler
    def initToken(self):
        """Build the "module_token" module and append it to self.module_manager.

        The module requests the popup_captcha page through self.getWebHtml
        and exposes the "session.token" value found in the response as the
        "token" output. Both EXCEPTION_OCCURED and OUTPUT_NOT_SATISFIED are
        registered with retry_times=100.
        """
        module = Module(self.getWebHtml, u"令牌获取")
        module.module_id = "module_token"
        module.appendUrl('https://www.sgs.gov.cn/notice/search/popup_captcha')
        module.appendHeaders({
            'Host': 'www.sgs.gov.cn',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://www.sgs.gov.cn/notice/home'
        })
        module.appendEncoding("utf-8")

        def getToken(html):
            """Return the session token embedded in html.

            Returns None (after logging an error) when html is empty or the
            token pattern is not found. An empty token is logged as an error
            but still returned.
            """
            # One guarded search covers both "marker missing" and "marker
            # present but no closing quote on the same line"; calling
            # .group() on a failed search would raise AttributeError.
            match = None
            if html:
                match = re.search(r'\"session\.token\": \"(.*?)\"', html)
            if match is None:
                self.holder.logging.error(u'获取session.token失败!')
                return None
            token = match.group(1)
            if not token:
                self.holder.logging.error(u'提取token失败!')
            self.holder.logging.info('token: %s' % token)
            return token

        module.appendOutput(name="token",
                            type=OutputType.FUNCTION,
                            function=getToken)
        module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
        module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
        self.module_manager.appendSubModule(module)
コード例 #5
0
ファイル: CrawlerJiangxi.py プロジェクト: chybot/crawler
    def initQynbInfo(self, module_super):
        """Attach two sub-modules to module_super for one annual report.

        The first has no visit function and only records the report year in
        self.value_dict; the second is bound to self.visitQynb and requests
        qynb_url itself.
        """
        def saveNbyear(qynb_url):
            """Store the 4-digit nbnd value of qynb_url under 'nb_name'."""
            years = re.findall(r'nbnd=(\d{4})', qynb_url, re.S)
            self.value_dict['nb_name'] = years[0]

        year_module = Module(None, u"设置年份")
        year_module.appendOutput(type=OutputType.FUNCTION, function=saveNbyear)
        module_super.appendSubModule(year_module, True)

        report_module = Module(self.visitQynb, u"获取企业年报")
        report_module.appendUrl(lambda qynb_url: qynb_url)
        report_module.appendHeaders(lambda ua, qyid, company_zch, qylx: {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.jxaic.gov.cn",
            "Referer": (
                "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/qygs_ViewQynb.pt?qyid=%s&zch=%s&qylx=%s&num=0&showgdxx=true"
                % (qyid, company_zch, qylx)),
            "Connection": "keep-alive",
        })
        report_module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5))
        report_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=5))
        report_module.addSleep(Sleep(2))
        module_super.appendSubModule(report_module, True)
コード例 #6
0
ファイル: CrawlerJiangxi.py プロジェクト: chybot/crawler
    def initAnnualReportPre(self, module_super):
        """Build the "fetch_qynb_list" module and attach it to module_super.

        The module is bound to self.getWebHtml, requests the qygs_ViewQynb
        page for the given qyid / company_zch / qylx and exposes the report
        links found in it as the optional output "qynb_param_list".
        """
        def getRightUrl(html=None):
            """Return the distinct annual-report URLs linked from html."""
            if not html:
                return []
            # A set removes duplicate links; each is prefixed with the host.
            hrefs = set(
                re.findall(r'<a\s+href="(.*?\?.*?nbnd=\d{4}.*?)"', html, re.S))
            return ['http://gsxt.jxaic.gov.cn' + href for href in hrefs]

        module = Module(self.getWebHtml, u"获取年报年份列表")
        module.module_id = "fetch_qynb_list"
        module.appendUrl(lambda qyid, company_zch, qylx: (
            "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/qygs_ViewQynb.pt?qyid=%s&zch=%s&qylx=%s&num=0"
            % (qyid, company_zch, qylx)))
        module.appendHeaders(lambda ua, qylx, qyid, company_zch: {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.jxaic.gov.cn",
            "Connection": "keep-alive",
        })
        module.appendOutput(
            name="qynb_param_list",
            type=OutputType.FUNCTION,
            function=getRightUrl,
            show_up=OutputParameterShowUpType.OPTIONAL)
        module_super.appendSubModule(module, True)
コード例 #7
0
    def initNianBao(self, module_super):
        """Attach the annual-report listing module to module_super.

        The module is bound to self.getWebHtml, requests company_url with
        'tab=01' replaced by 'tab=02', and exposes the [url, name] pairs of
        the linked reports as the optional output 'qynb_list'.
        """
        def xpaths(html):
            """Return [href, label] pairs for the report links in html."""
            anchors = etree.HTML(html).xpath(
                './/*[@class="info m-bottom m-top"]/tr/td/a')
            pairs = []
            for anchor in anchors:
                href = ''.join(anchor.xpath('@href')).strip()
                label = ''.join(anchor.xpath('text()')).replace(u'年度报告', '')
                # Links whose remaining text is exactly this label are skipped.
                if label == u'详情':
                    continue
                pairs.append([href, label])
            return pairs

        request_headers = {
            'Host': 'www.sgs.gov.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        module = Module(self.getWebHtml, u"抓取公司的年报信息")
        module.appendUrl(
            lambda company_url: company_url.replace('tab=01', 'tab=02'))
        module.appendHeaders(request_headers)
        module.appendOutput(
            name='qynb_list',
            type=OutputType.FUNCTION,
            function=xpaths,
            show_up=OutputParameterShowUpType.OPTIONAL)
        module_super.appendSubModule(module, True)
コード例 #8
0
ファイル: CrawlerJilin.py プロジェクト: chybot/crawler
 def initConfigHomePage(self):
     """Build the "module_home_page" module and append it to self.module_manager.

     The module is bound to self.visitHomePage, requests the aiccips root
     with the "cookie" value attached, and declares the "_csrf" input value
     as the LIST output "csrf", which is also registered as a middle-value
     monitor.
     """
     request_headers = {
         'Connection': 'keep-alive',
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Cache-Control': 'max-age=0',
         'Referer': 'http://211.141.74.198:8081/aiccips/',
         'Host': '211.141.74.198:8081',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0',
     }

     module = Module(self.visitHomePage, u"首页")
     module.module_id = "module_home_page"
     module.appendUrl("http://211.141.74.198:8081/aiccips/")
     module.appendHeaders(request_headers)
     module.appendCookie("cookie")
     module.appendOutput(
         "csrf", ".//input[@name='_csrf']/@value", OutputType.LIST)
     module.appendMiddleValueMonitor("csrf")
     module.addSleep(Sleep(3))
     self.module_manager.appendSubModule(module, True)
コード例 #9
0
ファイル: CrawlerHainan.py プロジェクト: chybot/crawler
    def initUrlParams(self,module_super):
        """Attach a module that unpacks a search hit into named values.

        The module has no visit function; its only output is the dict
        produced by initReady.
        """
        def initReady(com):
            """Map com[0] / com[1] to company_url / search_company.

            Returns an empty dict when com is empty or shorter than two items.
            """
            if not com or len(com) < 2:
                return {}
            return {"company_url": com[0], "search_company": com[1]}

        module = Module(None, u"公司详情链接参数值提取")
        module.appendOutput(type=OutputType.FUNCTION, function=initReady)
        module_super.appendSubModule(module, True)
コード例 #10
0
    def initCompanyInfoPrepare(self, module_super):
        """Attach a module that extracts link and name from a search hit.

        The module has no visit function; its only output is the dict
        produced by prepare.
        """
        def prepare(info):
            """Return the first link's href and text found under info.

            Indexing [0] raises IndexError when info contains no matching
            <a> node.
            """
            return {
                "company_url": info.xpath('.//a/@href')[0].strip(),
                "search_company": info.xpath('.//a/text()')[0].strip(),
            }

        module = Module(None, u"抓取公司前的预处理")
        module.appendOutput(type=OutputType.FUNCTION, function=prepare)
        module_super.appendSubModule(module, True)
コード例 #11
0
    def prepareCompnanyParms(self, module_super):
        """Attach a module that names the two fields of a search hit.

        The module has no visit function; its only output is the dict
        produced by prepareParams.
        """
        def prepareParams(com):
            """Return company_url / search_company from com[0] / com[1].

            An empty dict is returned unless com holds at least two items.
            """
            has_both_fields = com and len(com) >= 2
            if not has_both_fields:
                return {}
            return {
                "company_url": com[0],
                "search_company": com[1],
            }

        module = Module(None, u"抓取公司前的预处理")
        module.appendOutput(type=OutputType.FUNCTION, function=prepareParams)
        module_super.appendSubModule(module, True)
コード例 #12
0
ファイル: CrawlerZhejiang.py プロジェクト: chybot/crawler
    def initCompanyInfoPrepare(self, module_super):
        """Attach a module that turns a (url, name) hit into named values.

        The module has no visit function; its only output is the dict
        produced by prepare.
        """
        def prepare(com):
            """Return company_url / search_company, or {} for short input."""
            fields = {}
            if not com:
                return fields
            if len(com) < 2:
                return fields
            fields["company_url"] = com[0]
            fields["search_company"] = com[1]
            return fields

        module = Module(None, "抓取公司前的预处理")
        module.appendOutput(type=OutputType.FUNCTION, function=prepare)
        module_super.appendSubModule(module, True)
コード例 #13
0
ファイル: CrawlerQinghai.py プロジェクト: chybot/crawler
    def getStockholderInfo(self, module_super):
        """
        Fetch the paginated shareholder information.

        Builds the "get_stockholder_info" module bound to self.visitGdxx,
        requests QueryInvList.jspx for the given pno / company_id, and
        exposes the onclick values of the detail links as "gdxq_list".
        :param module_super: parent module the new module is appended to
        :return: None
        """
        def getGdxqList(html):
            """Return {"gdxq_list": [...]}, or {} if parsing html fails."""
            try:
                onclick_values = etree.HTML(html).xpath(
                    ".//*[@class='detailsList']/tr/td/a/@onclick")
            except Exception as e:
                self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
                return dict()
            return {"gdxq_list": onclick_values}

        module = Module(self.visitGdxx, u"抓取股东信息")
        module.module_id = "get_stockholder_info"
        module.appendUrl(lambda pno, company_id: (
            "http://218.95.241.36/QueryInvList.jspx?pno=%s&mainId=%s"
            % (pno, company_id)))
        module.appendHeaders(lambda company_id: {
            'Host': '218.95.241.36',
            'Proxy-Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'http://218.95.241.36/businessPublicity.jspx?id=' + str(company_id),
            'Accept-Encoding': 'gzip, deflate',
            "Accept-Language": "zh-CN,zh;q=0.8",
        })
        module.appendEncoding("utf-8")
        module.addSleep(Sleep(3))
        module.appendOutput(
            type=OutputType.FUNCTION,
            function=getGdxqList,
            show_up=OutputParameterShowUpType.OPTIONAL)
        module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
        module_super.appendSubModule(module, True)
コード例 #14
0
ファイル: CrawlerJiangxi.py プロジェクト: chybot/crawler
    def initConfigBaseInfo(self, module_super):
        """Attach the basic-information module to module_super.

        The module is bound to self.visitJbxx. Its inputs are the query
        parameters of company_url (see prepare); it requests
        gsgs_viewDjxx.pt with qyid / zch / qylx and exposes the stripped zch
        as the optional output 'company_zch'. Failed assertions and
        exceptions are registered with retry_times=5 and
        redo_module="module_validate_code".
        """
        module = Module(self.visitJbxx, u"基本信息")

        def prepare(company_url):
            """Return the query-string parameters of company_url as a dict.

            Values keep everything after the first "=", so a value that
            itself contains "=" is not truncated. Items without "=" (for
            example an empty query string) are skipped instead of raising
            IndexError.
            """
            query_ = {}
            for item in urlparse.urlparse(company_url).query.split("&"):
                key, separator, value = item.partition("=")
                if not separator:
                    continue
                query_[key] = value
            return query_

        module.appendInput(InputType.FUNCTION, prepare)

        def assertReqArgs(zch):
            """Return True when the request argument zch is non-empty."""
            return bool(zch)

        module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=5,
                  assert_function=assertReqArgs,
                  redo_module="module_validate_code"))
        module.appendOutput(name='company_zch',
                            type=OutputType.FUNCTION,
                            function=lambda zch: zch.strip(),
                            show_up=OutputParameterShowUpType.OPTIONAL)
        module.appendUrl(
            lambda qyid, zch, qylx:
            "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxx.pt?qyid=%s&zch=%s&qylx=%s&num=undefined&showgdxx=true"
            % (qyid, zch, qylx))
        module.appendHeaders(
            lambda ua, qylx, qyid, zch: {
                'Accept-Language':
                'en-US,en;q=0.5',
                'Accept-Encoding':
                'gzip, deflate',
                'Connection':
                'keep-alive',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'User-Agent':
                ua,
                'Referer':
                'http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/ccjcgs_ccjcgsIndexDetail.pt?qylx=%s&qyid=%s&zch=%s&tabName=1'
                % (qylx, qyid, zch),
                'Host':
                'gsxt.jxaic.gov.cn'
            })
        module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=5,
                  redo_module="module_validate_code"))
        module_super.appendSubModule(module, True)
コード例 #15
0
    def getStockholderInfo(self, module_super):
        """
        Fetch the paginated shareholder information.

        Builds the "get_stockholder_info" module bound to self.visitGdxx,
        requests QueryInvList.jspx on gsxt.hljaic.gov.cn for the given
        pno / company_id, and exposes the onclick values of the detail
        links as "gdxq_list".
        :param module_super: parent module the new module is appended to
        :return: None
        """
        def getGdxqList(html):
            """Return {"gdxq_list": [...]}; on any parsing error return {}."""
            details = dict()
            try:
                document = etree.HTML(html)
                onclick_values = document.xpath(
                    ".//*[@class='detailsList']/tr/td/a/@onclick")
                details["gdxq_list"] = onclick_values
            except Exception as e:
                self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
                details = dict()
            return details

        def build_headers(company_id):
            return {
                "Host": "gsxt.hljaic.gov.cn",
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                'Referer': 'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' + str(company_id),
            }

        module = Module(self.visitGdxx, u"抓取股东信息")
        module.module_id = "get_stockholder_info"
        module.appendUrl(lambda pno, company_id: (
            "http://gsxt.hljaic.gov.cn/QueryInvList.jspx?pno=%s&mainId=%s"
            % (pno, company_id)))
        module.appendHeaders(lambda company_id: build_headers(company_id))
        module.appendEncoding("utf-8")
        module.addSleep(Sleep(3))
        module.appendOutput(
            type=OutputType.FUNCTION,
            function=getGdxqList,
            show_up=OutputParameterShowUpType.OPTIONAL)
        module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))
        module_super.appendSubModule(module, True)
コード例 #16
0
 def initConfigSearchList(self):
     """Build the search-list module and append it to self.module_manager.

     The module is bound to self.visitSearchList, POSTs "post_data" to
     showInfo.html and declares the LIST output "search_list". Exceptions
     and unsatisfied outputs are registered with retry_times=100 and
     redo_module="module_validate_code"; the assertion fails when
     self.report.access_type equals SeedAccessType.NON_COMPANY.
     """
     request_headers = {
         'Host': 'gsxt.gdgs.gov.cn',
         'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
         'Accept-Encoding': 'gzip, deflate',
         'Referer': 'http://gsxt.gdgs.gov.cn/aiccips/',
         'Content-Type': 'application/x-www-form-urlencoded',
         'Connection': 'keep-alive',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0',
     }

     module = Module(self.visitSearchList, "搜索列表")
     module.appendUrl(
         'http://gsxt.gdgs.gov.cn/aiccips/CheckEntContext/showInfo.html')
     module.appendHeaders(request_headers)
     module.appendWebMethod("post")
     module.appendPostData("post_data")
     module.appendOutput("search_list", ".//*[@class='list']", OutputType.LIST)
     module.addEvent(Event(
         EventType.EXCEPTION_OCCURED,
         retry_times=100,
         redo_module="module_validate_code"))
     module.addEvent(Event(
         EventType.OUTPUT_NOT_SATISFIED,
         retry_times=100,
         redo_module="module_validate_code"))
     module.addEvent(Event(
         EventType.ASSERT_FAILED,
         retry_times=0,
         assert_function=lambda: not (
             self.report.access_type == SeedAccessType.NON_COMPANY)))
     self.module_manager.appendSubModule(module)
コード例 #17
0
ファイル: CrawlerShandong.py プロジェクト: chybot/crawler
 def initHomePage(self):
     """Build the "home_page" module and append it to self.module_manager.

     The module is bound to self.visitHomePage, requests the site root and
     declares the "_csrf" value of the search form as the LIST output
     "csrf". OUTPUT_NOT_SATISFIED is registered with retry_times=100 and
     EXCEPTION_OCCURED with retry_times=5.
     """
     module = Module(self.visitHomePage, u"访问首页")
     module.module_id = "home_page"
     module.appendUrl('http://218.57.139.24')
     module.appendHeaders(
         lambda ua: {
             "Host": '218.57.139.24',
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "en-US,en;q=0.8",
             "Accept-Encoding": "gzip, deflate, sdch",
             "Connection": "keep-alive",
             'User-Agent': ua,
         })
     module.addSleep(Sleep(2))
     module.appendOutput(
         "csrf",
         '//form[@id="searchform"]/input[@name="_csrf"]/@value',
         OutputType.LIST)
     module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
     module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
     self.module_manager.appendSubModule(module, True)
コード例 #18
0
    def initConfigCompanyInfoPre(self, module_super):
        """Build the "fetch_company_info" module and attach it to module_super.

        The module has no visit function; its optional output is the dict
        produced by setComParms. EXCEPTION_OCCURED is registered with
        retry_times=100.
        """
        def setComParms(tag_a):
            """Name the first three items of tag_a.

            Indexing raises IndexError when tag_a has fewer than three items.
            """
            company_name = tag_a[0]
            entbigtype = tag_a[1]
            pripid = tag_a[2]
            return {
                'company_name': company_name,
                'entbigtype': entbigtype,
                'pripid': pripid,
            }

        module = Module(None, u"抓取公司前的预处理")
        module.module_id = "fetch_company_info"
        module.appendOutput(
            type=OutputType.FUNCTION,
            function=setComParms,
            show_up=OutputParameterShowUpType.OPTIONAL)
        module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
        module_super.appendSubModule(module, True)
コード例 #19
0
ファイル: CrawlerShandong.py プロジェクト: chybot/crawler
 def initQynbInfo(self, module_super):
     """Attach two sub-modules to module_super for one annual report.

     The first has no visit function and outputs {'nb_name': qynb_tuper[0]};
     the second is bound to self.visitQynb and requests the path held in
     qynb_tuper[1] on 218.57.139.24.
     """
     year_module = Module(None, u"设置年份")
     year_module.appendOutput(
         type=OutputType.FUNCTION,
         function=lambda qynb_tuper: {'nb_name': qynb_tuper[0]})
     module_super.appendSubModule(year_module, True)

     report_module = Module(self.visitQynb, u"获取企业年报")
     report_module.appendUrl(
         lambda qynb_tuper: "http://218.57.139.24%s" % qynb_tuper[1])
     report_module.appendHeaders(
         lambda ua, com: {
             "User-Agent": ua,
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Host": "218.57.139.24",
             "Referer": "http://218.57.139.24/pub/qygsdetail/%s/%s" % (com[3], com[2]),
             "Connection": "keep-alive",
         })
     report_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
     report_module.addSleep(Sleep(2))
     module_super.appendSubModule(report_module, True)
コード例 #20
0
ファイル: CrawlerJilin.py プロジェクト: chybot/crawler
    def initConfigShareHolderInfo(self, module_super):
        """Attach the shareholder-information module to module_super.

        The module is bound to self.visitGdxxJson. Its request is configured
        at run time by prepare; a field mapper is registered for the
        shareholder columns, and the dict produced by parse4bgxx is declared
        as a FUNCTION output.
        """
        module = Module(self.visitGdxxJson, u"股东信息")

        # Dynamically add the module's inputs.
        def prepare(gdxx_text, csrf, params):
            """Configure the module from the values available at run time.

            When gdxx_text is already present it is registered as web
            content and no request is set up; otherwise a POST to gsczxx is
            configured with the last csrf value and params[0].
            """
            if gdxx_text:
                module.appendWebContent("gdxx_text")
                return
            module.appendUrl("http://211.141.74.198:8081/aiccips/pub/gsczxx")
            module.appendHeaders({
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Host': '211.141.74.198:8081',
                'X-CSRF-TOKEN': csrf[-1]
            })
            module.appendWebMethod("post")
            module.appendPostData({'encrpripid': params[0]})
            module.appendCookie("cookie")

        module.appendInput(InputType.FUNCTION, prepare)
        module.addMapper({
            'blicno': u'股东信息.证照或证件号码',
            'inv': u'股东信息.股东',
            'blictype': u'股东信息.证照或证件类型',
            'invtype': u'股东信息.股东类型',
            'primary_key': 'inv,blicno'
        })

        def parse4bgxx(script):
            """Return {"bgxx_text": ...} extracted from script[1] or script[2].

            Returns None when script is not a list with at least two items.
            script[2] is only consulted when it exists and script[1] yielded
            nothing, so a two-item list no longer raises IndexError.
            """
            if not script or not isinstance(script, list) or len(script) < 2:
                return None
            bgxx_text = None
            for candidate in script[1:3]:
                bgxx_text = dataretrieve.regex_parse(
                    {'regex': 'bgsxliststr =\'(.*)\''}, self.holder.logging,
                    candidate)
                if bgxx_text:
                    break
            return {"bgxx_text": bgxx_text}

        module.appendOutput(type=OutputType.FUNCTION, function=parse4bgxx)
        module_super.appendSubModule(module, True)
コード例 #21
0
ファイル: CrawlerJiangxi.py プロジェクト: chybot/crawler
    def initCompanyInfoPrepare(self, module_super):
        """Attach a module that names the fields of a (url, name) search hit.

        The module has no visit function; its only output is the dict
        produced by prepare. OUTPUT_NOT_SATISFIED is registered with
        retry_times=5, and EXCEPTION_OCCURED with retry_times=5 and
        redo_module="module_validate_code".
        """
        def prepare(com):
            """Return stripped company_url / company_name, or {} if unusable.

            The input is unusable when it has fewer than two items or either
            of the first two is blank after stripping.
            """
            if not com or len(com) < 2:
                return {}
            if not (com[0].strip() and com[1].strip()):
                return {}
            # The company name is stored under "company_name" here.
            return {
                "company_url": com[0].strip(),
                "company_name": com[1].strip(),
            }

        module = Module(None, u"抓取公司前的预处理")
        module.appendOutput(type=OutputType.FUNCTION, function=prepare)
        module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5))
        module.addEvent(Event(
            EventType.EXCEPTION_OCCURED,
            retry_times=5,
            redo_module="module_validate_code"))
        module_super.appendSubModule(module, True)
コード例 #22
0
ファイル: CrawlerHainan.py プロジェクト: chybot/crawler
 def initAnnualReportList(self, module_super):
     """Attach the annual-report list module to module_super.

     The module is bound to self.visitQynbList, requests
     enterprisePublicity.jspx for the given id with company_url as the
     Referer, and declares the report links as the optional LIST output
     "nb_list".
     """
     module = Module(self.visitQynbList, u"第九步_获取企业年报列表")
     module.appendUrl(
         lambda id: "http://aic.hainan.gov.cn:1888/enterprisePublicity.jspx?id=%s" % id)
     module.appendHeaders(lambda company_url: {
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
         "Accept-Encoding": "gzip, deflate",
         "Accept-Language": "zh-CN,zh;q=0.8",
         "Cache-Control": "max-age=0",
         "Connection": "keep-alive",
         "Host": "aic.hainan.gov.cn:1888",
         "Referer": company_url,
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
     })
     module.appendOutput(
         name="nb_list",
         xpath=".//*[@id='qiyenianbao']/table//td/a",
         type=OutputType.LIST,
         show_up=OutputParameterShowUpType.OPTIONAL)
     module.addSleep(Sleep(2))
     module_super.appendSubModule(module)
コード例 #23
0
ファイル: CrawlerJiangxi.py プロジェクト: chybot/crawler
 def initConfigChangeInfo(self, module_super):
     """Attach the change-information module to module_super.

     The module is bound to self.visitBgxx, requests gsgs_viewDjxxBgxx.pt
     for the given qyid, and declares the optional output "bgxx_pages"
     computed by self.getPageNoPrepare.
     """
     module = Module(self.visitBgxx, u"变更信息")
     module.appendUrl(lambda qyid: (
         "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s"
         % qyid))
     module.appendHeaders(lambda ua: {
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Connection': 'keep-alive',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'User-Agent': ua,
         'Host': 'gsxt.jxaic.gov.cn',
     })
     module.appendOutput(
         "bgxx_pages",
         None,
         OutputType.FUNCTION,
         self.getPageNoPrepare,
         OutputParameterShowUpType.OPTIONAL)
     module_super.appendSubModule(module, True)
コード例 #24
0
ファイル: CrawlerZhejiang.py プロジェクト: chybot/crawler
 def initConfigChangeInfo(self, module_super):
     """Attach the change-information module to module_super.

     The module is bound to self.visitBgxx, requests gsgs_viewDjxxBgxx.pt
     for the given qyid with a fixed User-Agent, and declares the optional
     output "bgxx_pages" computed by self.getPageNoPrepare.
     """
     # NOTE(review): URL and Host point at gsxt.jxaic.gov.cn - confirm this
     # is the intended host for this crawler.
     request_headers = {
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Connection': 'keep-alive',
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
         'Host': 'gsxt.jxaic.gov.cn',
     }

     module = Module(self.visitBgxx, "变更信息")
     module.appendUrl(lambda qyid: (
         "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxBgxx.pt?qyid=%s"
         % qyid))
     module.appendHeaders(request_headers)
     module.appendOutput(
         "bgxx_pages",
         None,
         OutputType.FUNCTION,
         self.getPageNoPrepare,
         OutputParameterShowUpType.OPTIONAL)
     module_super.appendSubModule(module, True)
コード例 #25
0
    def initCompanyPrepare(self, module_super):
        """Attach a module that extracts name and URL from a search hit.

        The module is created with only a name; its single output is the
        dict produced by company_info_prepare.
        """
        module = Module(name="抓取公司前的预处理")

        def company_info_prepare(info):
            """Return company_url, company_name and search_company for info.

            Both name keys carry the text of the first <a> under info ('' if
            it cannot be read). company_url is the first <a>'s href, with a
            leading '../GSpublicity/' rewritten to an absolute gsxt.gdgs
            URL, or '' when there is no link or no href.
            """
            try:
                company_list_name = info.xpath('.//a/text()')[0].strip()
            except Exception:
                # Best effort: a hit without readable link text keeps ''.
                company_list_name = ''

            company_url = ''
            anchors = info.xpath(".//a")
            if anchors and isinstance(anchors, list):
                # presumably an lxml element, whose .get() returns None for
                # a missing attribute; fall back to '' so the membership
                # test below cannot raise TypeError.
                company_url = anchors[0].get("href") or ''

            if '../GSpublicity/' in company_url:
                # Drop the leading ".." and prefix the site base path.
                company_url = 'http://gsxt.gdgs.gov.cn/aiccips' + company_url[2:]

            return {"company_url": company_url, "company_name": company_list_name,
                    "search_company": company_list_name}

        module.appendOutput(type=OutputType.FUNCTION, function=company_info_prepare)
        module_super.appendSubModule(module)
コード例 #26
0
ファイル: CrawlerHainan.py プロジェクト: chybot/crawler
 def initSearchList(self):
     """Build the company search module and append it to self.module_manager.

     The module is bound to self.visitSearchList and POSTs yzm and
     company_key to searchList.jspx. It declares the link hrefs
     ("url_list"), the link texts ("name_list") and their pairing
     ("search_list"), which is also registered as a middle-value monitor.
     Exceptions and unsatisfied outputs are registered with retry_times=20
     and redo_module="hn_yzm_pic"; the assertion fails when
     self.report.access_type equals SeedAccessType.NON_COMPANY.
     """
     request_headers = {
         "Host": "aic.hainan.gov.cn:1888",
         "Connection": "keep-alive",
         "Cache-Control": "max-age=0",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
         "Origin": "http://aic.hainan.gov.cn:1888",
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
         "Content-Type": "application/x-www-form-urlencoded",
         "Referer": "http://aic.hainan.gov.cn:1888/search.jspx",
         "Accept-Encoding": "gzip, deflate",
         "Accept-Language": "zh-CN,zh;q=0.8",
     }

     module = Module(self.visitSearchList,u"第三步_开始搜索公司列表")
     module.appendUrl("http://aic.hainan.gov.cn:1888/searchList.jspx")
     module.appendHeaders(request_headers)
     module.appendWebMethod("post")
     module.appendPostData(
         lambda yzm, company_key: {"checkNo": yzm, "entName": company_key})
     module.appendOutput(
         "url_list", ".//div[@class='list']//a/@href", OutputType.LIST)
     module.appendOutput(
         "name_list", ".//div[@class='list']//a/text()", OutputType.LIST)
     module.appendOutput(
         name="search_list",
         type=OutputType.FUNCTION,
         function=lambda url_list, name_list: zip(url_list, name_list))
     module.addEvent(Event(
         EventType.EXCEPTION_OCCURED,
         retry_times=20,
         redo_module="hn_yzm_pic"))
     module.addEvent(Event(
         EventType.OUTPUT_NOT_SATISFIED,
         retry_times=20,
         redo_module="hn_yzm_pic"))
     module.addEvent(Event(
         EventType.ASSERT_FAILED,
         retry_times=0,
         assert_function=lambda: not (
             self.report.access_type == SeedAccessType.NON_COMPANY)))
     module.appendMiddleValueMonitor("search_list")
     module.addSleep(Sleep(1))
     self.module_manager.appendSubModule(module)
コード例 #27
0
    def initConfigBaseInfo(self, module_super):
        """Register the "basic information" module under ``module_super``.

        The module POSTs ``method=qyInfo`` to ``ztxy.do``, stores the first
        extracted registration number in ``self.value_dict['company_zch']``,
        asserts on the company name and registration number, and extracts
        the ``showRyxx(...)`` argument pairs as ``xh_pripid``.
        """

        def build_post_data(pripid, entbigtype):
            # 'random' carries the current time in milliseconds as a string.
            return {
                'djjg': '',
                'maent.entbigtype': entbigtype,
                'maent.pripid': pripid,
                'method': 'qyInfo',
                'random': str(int(time.time() * 1000)),
            }

        def build_headers(ua):
            return {
                "User-Agent": ua,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Host": "gsxt.scaic.gov.cn",
                "Origin": "http://gsxt.scaic.gov.cn",
                "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
                "Connection": "keep-alive",
            }

        def setCompanyZch(company_zch_list=None):
            # Keep the first extracted value, or None when nothing matched.
            first_zch = None
            if company_zch_list:
                first_zch = company_zch_list[0]
            self.value_dict['company_zch'] = first_zch

        # Assert on the company name and the registration number.
        def assertNameZch(company_name=None, company_zch=None):
            if not company_name or not company_zch:
                return False
            name_ok = 0 < len(company_name) < 100
            zch_ok = 0 < len(company_zch) < 100
            return name_ok and zch_ok

        # Extract the shareholder-detail parameters from the basic-info page;
        # they form the POST parameters for the shareholder-detail request.
        def getXhPripid(html):
            pattern = r'\s+onclick="showRyxx\(\'(.+?)\'\,\'(.+?)\'\)"'
            return re.findall(pattern, html, re.S)

        module = Module(self.visitJbxx, u"基本信息")
        module.appendPostData(build_post_data)
        module.appendWebMethod("post")
        module.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
        module.appendHeaders(build_headers)

        module.appendOutput("company_zch_list",
                            '//table[1]/tr[2]/td[1]/text()',
                            OutputType.LIST)
        module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5))
        module.appendOutput(type=OutputType.FUNCTION, function=setCompanyZch)
        module.addEvent(Event(EventType.ASSERT_FAILED,
                              retry_times=5,
                              assert_function=assertNameZch))
        module.appendOutput(name='xh_pripid',
                            type=OutputType.FUNCTION,
                            function=getXhPripid,
                            show_up=OutputParameterShowUpType.OPTIONAL)
        module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))

        module.addSleep(Sleep(2))
        module_super.appendSubModule(module, True)
コード例 #28
0
ファイル: CrawlerShandong.py プロジェクト: chybot/crawler
 def initCompanyInfo(self, module_super):
     """Register the "basic information" module under ``module_super``.

     The module requests ``/pub/<com[1]>``, stores the first extracted
     registration number (stripped) in ``self.value_dict['company_zch']``
     and collects every ``"recid"`` value found in the page as
     ``recid_list``.
     """

     def build_url(com):
         return "http://218.57.139.24/pub/" + com[1]

     def build_headers(ua):
         return {
             "Host": "218.57.139.24",
             "User-Agent": ua,
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive",
             "Referer": "http://218.57.139.24/",
         }

     def setCompanyZch(company_zch_list=None):
         # Keep the first extracted value, or None when nothing matched.
         first_zch = None
         if company_zch_list:
             first_zch = company_zch_list[0].strip()
         self.value_dict['company_zch'] = first_zch

     def getGdxxParms(html):
         if not html:
             return []
         return re.findall(r'\,"recid":"(.+?)",', html, re.S)

     module = Module(self.visitJbxx, u"基本信息")
     module.appendUrl(build_url)
     module.appendHeaders(build_headers)

     module.appendOutput("company_zch_list",
                         '//table[1]/tr[2]/td[1]/text()',
                         OutputType.LIST)
     module.appendOutput(type=OutputType.FUNCTION, function=setCompanyZch)
     module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED,
                           retry_times=5,
                           redo_module='home_page'))

     # List of ids used to fetch the shareholder details.
     module.appendOutput(name="recid_list",
                         type=OutputType.FUNCTION,
                         function=getGdxxParms,
                         show_up=OutputParameterShowUpType.OPTIONAL)

     module.addSleep(Sleep(2))
     module.addEvent(Event(EventType.EXCEPTION_OCCURED,
                           retry_times=5,
                           redo_module='home_page'))
     module_super.appendSubModule(module, True)
コード例 #29
0
    def getCmpnySereachList(self):
        """Register the module that submits the search form and lists the hits.

        The module POSTs the captcha answer (``yzm``) and the keyword
        (``textfield``) to ``showInfo.html``, extracts anchor hrefs and texts
        below the ``list`` element and pairs them up as ``search_list``.
        The module is appended to ``self.module_manager``.
        """
        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
        }

        def build_post_data(yzm, textfield):
            # NOTE(review): r"\n" is the two-character sequence backslash + "n",
            # not a newline, so real line breaks in the keyword are left
            # untouched -- confirm this is intended.
            cleaned = textfield.replace(r"\n", "")
            return {"code": yzm, "textfield": cleaned}

        def pair_results(url_list, name_list):
            # One (url, name) tuple per search hit.
            return zip(url_list, name_list)

        def seed_is_company():
            # The assertion fails only for seeds flagged as NON_COMPANY.
            return not (self.report.access_type == SeedAccessType.NON_COMPANY)

        module = Module(self.visitSearchList, u"抓取公司列表")
        module.module_id = "get_search_list"
        module.appendUrl(
            "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/showInfo.html")
        module.appendHeaders(request_headers)
        module.appendWebMethod("post")
        module.appendEncoding("utf-8")
        module.appendPostData(build_post_data)
        module.addSleep(Sleep(3))

        module.appendOutput("url_list",
                            ".//*[@class='list']/ul/li/a/@href",
                            OutputType.LIST)
        module.appendOutput("name_list",
                            ".//*[@class='list']/ul/li/a/text()",
                            OutputType.LIST)
        module.appendOutput(name="search_list",
                            type=OutputType.FUNCTION,
                            function=pair_results)

        module.addEvent(Event(EventType.ASSERT_FAILED,
                              retry_times=0,
                              assert_function=seed_is_company))
        # Both failure events name "init_validate_code" as the redo target.
        module.addEvent(Event(EventType.EXCEPTION_OCCURED,
                              retry_times=100,
                              redo_module="init_validate_code"))
        module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED,
                              retry_times=100,
                              redo_module="init_validate_code"))

        self.module_manager.appendSubModule(module)
コード例 #30
0
    def initAnnualReportPre(self, module_super):
        """Register the module that fetches the list of annual-report years.

        The module POSTs ``method=qygsInfo`` to ``ztxy.do`` and exposes the
        four-digit years found in ``doNdbg('<year>')`` onclick handlers as
        the optional output ``qinb_param_list``.  It is appended to
        ``module_super``.
        """
        module = Module(self.getWebHtml, u"获取年报年份列表")
        module.module_id = "fetch_qynb_list"
        module.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
        module.appendHeaders(
            lambda ua: {
                "User-Agent": ua,
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Host": "gsxt.scaic.gov.cn",
                "Origin": "http://gsxt.scaic.gov.cn",
                "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
                "Connection": "keep-alive"
            })
        module.appendWebMethod("post")

        def getQynbParamList(html):
            """Return the distinct report years found in ``html``, ascending."""
            if not html:
                return []
            years = re.findall(r'\s+onclick="doNdbg\(\'(\d{4})\'\);"', html,
                               re.S)
            # De-duplicate, then sort: plain list(set(...)) yields an
            # arbitrary order, which made the crawl order of the yearly
            # reports unpredictable.
            return sorted(set(years))

        module.appendPostData(
            lambda pripid: {
                'czmk': 'czmk8',
                'maent.pripid': pripid,
                'method': 'qygsInfo',
                # Current time in milliseconds, sent as a string.
                'random': str(int(time.time() * 1000))
            })
        module.appendOutput(name="qinb_param_list",
                            type=OutputType.FUNCTION,
                            function=getQynbParamList,
                            show_up=OutputParameterShowUpType.OPTIONAL)
        module.addSleep(Sleep(2))
        module_super.appendSubModule(module, True)