Beispiel #1
0
    def initShareHolderInfoPage(self, module_super):
        """Register the paged shareholder-list request under ``module_super``.

        A module carrying ``Iterator("gdxx_pages", "page_no")`` is attached to
        ``module_super``; a child module built around ``self.visitGdxx`` is
        configured as a POST request and attached beneath it.

        NOTE(review): the callables registered below presumably receive their
        arguments by parameter name (``qyid``, ``page_no``, ``ua``), so those
        names are kept exactly -- confirm against the Module implementation.

        :param module_super: parent module that receives the paging module.
        """

        def buildUrl(qyid):
            return (
                "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxxGdxx.pt?qyid=%s"
                % qyid)

        def buildPostData(page_no):
            # "mark" may be 0, -1 or 1; which value applies depends on the
            # order in which the page links were clicked.
            return {'page': page_no, 'limit': 5, 'mark': 0}

        def buildHeaders(ua):
            return {
                'Host': 'gsxt.jxaic.gov.cn',
                'Connection': 'keep-alive',
                'User-Agent': ua,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
            }

        pager = Module(None, u"进入股东翻页", Iterator("gdxx_pages", "page_no"))
        module_super.appendSubModule(pager, True)

        fetcher = Module(self.visitGdxx, u"获取股东翻页信息")
        fetcher.appendUrl(buildUrl)
        fetcher.appendWebMethod("post")
        fetcher.appendPostData(buildPostData)
        fetcher.appendHeaders(buildHeaders)
        pager.appendSubModule(fetcher, True)
Beispiel #2
0
    def getAnnalsInfo(self, module_super):
        """Register the annual-report detail request under ``module_super``.

        A module carrying ``Iterator("annals_list", "annals")`` is attached to
        ``module_super``. Its child, built around ``self.visitQynb`` and given
        the id ``"get_annals_info"``, gets an input function, a URL function,
        fixed headers, utf-8 encoding, a ``Sleep(3)`` and an
        ``EXCEPTION_OCCURED`` event that names the child's own id as the
        module to redo (``retry_times=100``).

        :param module_super: parent module that receives the iterating module.
        :return: None
        """
        detail_id = "get_annals_info"
        request_headers = {
            "Host": "218.95.241.36",
            "Proxy-Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer": "http://218.95.241.36/searchList.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

        def prepareParams(annals):
            # An entry needs at least two items: [0] is stored as the report
            # URL and [1] as the report name; otherwise nothing is produced.
            if not annals or len(annals) < 2:
                return dict()
            return {'nb_url': annals[0], 'nb_name': annals[1]}

        def getURL(nb_url=None):
            return u'http://218.95.241.36' + nb_url if nb_url else None

        loop_module = Module(None, u"遍历年报列表",
                             Iterator("annals_list", "annals"))
        module_super.appendSubModule(loop_module, True)

        detail = Module(self.visitQynb, u"抓取年报详情")
        detail.module_id = detail_id
        detail.appendInput(InputType.FUNCTION, input_value=prepareParams)
        detail.appendUrl(getURL)
        detail.appendHeaders(request_headers)
        detail.appendEncoding("utf-8")
        detail.addSleep(Sleep(3))
        detail.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module=detail_id))

        loop_module.appendSubModule(detail)
Beispiel #3
0
    def getAnnalsInfo(self, module_super):
        """Register the annual-report detail request under ``module_super``.

        A module carrying ``Iterator("annals_list", "annals")`` is attached to
        ``module_super``. Its child, built around ``self.visitQynb`` and given
        the id ``"get_annals_info"``, gets an input function, a URL function
        rooted at gsxt.hljaic.gov.cn, fixed headers, utf-8 encoding, a
        ``Sleep(3)`` and an ``EXCEPTION_OCCURED`` event that names the child's
        own id as the module to redo (``retry_times=100``).

        :param module_super: parent module that receives the iterating module.
        """
        detail_id = "get_annals_info"
        request_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
        }

        def prepareParams(annals):
            # An entry needs at least two items: [0] is stored as the report
            # URL and [1] as the report name; otherwise nothing is produced.
            if not annals or len(annals) < 2:
                return dict()
            return {'nb_url': annals[0], 'nb_name': annals[1]}

        def getURL(nb_url=None):
            return u'http://gsxt.hljaic.gov.cn' + nb_url if nb_url else None

        loop_module = Module(None, u"遍历年报列表",
                             Iterator("annals_list", "annals"))
        module_super.appendSubModule(loop_module, True)

        detail = Module(self.visitQynb, u"抓取年报详情")
        detail.module_id = detail_id
        detail.appendInput(InputType.FUNCTION, input_value=prepareParams)
        detail.appendUrl(getURL)
        detail.appendHeaders(request_headers)
        detail.appendEncoding("utf-8")
        detail.addSleep(Sleep(3))
        detail.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module=detail_id))

        loop_module.appendSubModule(detail)
Beispiel #4
0
    def initShareholderInfoDetail(self, module_super):
        """Register the shareholder-detail request under ``module_super``.

        A module carrying ``Iterator("gdxx_list", "gdxx_rcd")`` is attached to
        ``module_super``; its child, built around ``self.visitGdxq``, derives
        the detail URL from the record's ``onclick`` value and is given a
        ``Sleep(2)``.

        :param module_super: parent module that receives the iterating module.
        """
        iterator = Iterator("gdxx_list", "gdxx_rcd")
        module = Module(None, u"开始获取股东详情", iterator)
        module_super.appendSubModule(module, True)

        sub_module = Module(self.visitGdxq, u"获取股东详情信息")

        def getGdxqUrl(gdxx_rcd):
            """Return the detail URL found in ``gdxx_rcd``, or None.

            The first value that contains ``'onclick'`` and whose onclick text
            has a well-formed ``( ... )`` argument list wins; the text between
            the parentheses, with single quotes removed, is appended to the
            site root.
            """
            for key in gdxx_rcd:
                cell = gdxx_rcd[key]
                if 'onclick' not in cell:
                    continue
                if isinstance(cell, basestring):
                    # NOTE(review): eval() on text taken from a crawled page
                    # executes arbitrary expressions. If the value is always a
                    # plain dict literal, ast.literal_eval would be the safe
                    # replacement -- confirm the data shape before switching.
                    cell = eval(cell)
                onclick = cell["onclick"]
                start = onclick.find('(')
                # Look for the closing parenthesis only after the opening one.
                end = onclick.find(')', start + 1) if start >= 0 else -1
                if end < 0:
                    # str.find returned -1: slicing with it would silently
                    # yield a truncated, bogus link, so skip this value.
                    continue
                xq_link = onclick[start + 1:end].replace("'", "")
                return "http://aic.hainan.gov.cn:1888" + xq_link
            return None

        sub_module.appendUrl(getGdxqUrl)
        sub_module.addSleep(Sleep(2))
        module.appendSubModule(sub_module)
Beispiel #5
0
    def initGdxq(self, module_super):
        """Register the shareholder-detail request under ``module_super``.

        A module carrying ``Iterator(seeds="gdxq_list", param_name="gdxq")``
        is attached to ``module_super``; its child, built around
        ``self.visitGdxq``, is given ``"gdxq"`` as its URL argument.

        NOTE(review): the header builder is registered on the iterating
        module, not on the request child, unlike the sibling builders in this
        code base -- confirm whether that is intentional (e.g. headers being
        inherited by sub-modules).

        :param module_super: parent module that receives the iterating module.
        """

        def buildHeaders(gdxq):
            # The current "gdxq" value doubles as the Referer.
            return {
                'Host': 'www.sgs.gov.cn',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                'Accept-Encoding': 'gzip, deflate',
                'Referer': gdxq
            }

        seeds = Iterator(seeds="gdxq_list", param_name="gdxq")
        loop_module = Module(name=u"遍历股东详情", iterator=seeds)
        module_super.appendSubModule(loop_module)

        detail = Module(self.visitGdxq, u"抓取股东详情")
        detail.appendUrl("gdxq")
        loop_module.appendHeaders(buildHeaders)
        loop_module.appendSubModule(detail)
Beispiel #6
0
    def initShareHolderDetail(self, module_super):
        """Register the shareholder-detail request under ``module_super``.

        A module carrying ``Iterator("gdxx_list", "gdxx_rcd")`` is attached to
        ``module_super``; its child, built around ``self.visitGdxq``, takes
        its URL from ``self.getGdxqUrl`` and sends a fixed header set.

        :param module_super: parent module that receives the iterating module.
        """
        fixed_headers = {
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
            'Host': 'gsxt.jxaic.gov.cn',
            'Cache-Control': 'max-age=0'
        }

        loop_module = Module(None, "进入股东详情",
                             Iterator("gdxx_list", "gdxx_rcd"))
        module_super.appendSubModule(loop_module, True)

        detail = Module(self.visitGdxq, "获取股东详情信息")
        detail.appendUrl(self.getGdxqUrl)
        detail.appendHeaders(fixed_headers)
        loop_module.appendSubModule(detail)
Beispiel #7
0
 def initGdxqInfoPrepare(self, module_super):
     """Register the shareholder-detail request under ``module_super``.

     A module carrying ``Iterator("recid_list", "rid")`` is attached to
     ``module_super`` and given a ``Sleep(2)`` plus an ``EXCEPTION_OCCURED``
     event (``retry_times=5``, ``redo_module='home_page'``). Its child, built
     around ``self.visitGdxq``, builds the URL from ``com[2]`` and the
     stripped ``rid`` and uses ``com[1]`` for the Referer.

     NOTE(review): the builders below presumably receive ``rid``, ``com`` and
     ``ua`` by parameter name, so the names are kept -- confirm against the
     Module implementation.

     :param module_super: parent module that receives the iterating module.
     """

     def buildUrl(rid, com):
         return 'http://218.57.139.24/pub/gsnzczxxdetail/%s/%s' % (
             com[2], rid.strip())

     def buildHeaders(ua, com):
         return {
             "Host": "218.57.139.24",
             "User-Agent": ua,
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive",
             'Referer': 'http://218.57.139.24/pub/' + com[1],
         }

     loop_module = Module(None, u"进入股东详情", Iterator("recid_list", "rid"))
     module_super.appendSubModule(loop_module, True)

     detail = Module(self.visitGdxq, u"获取股东详情")
     detail.appendUrl(buildUrl)
     detail.appendHeaders(buildHeaders)

     loop_module.addSleep(Sleep(2))
     loop_module.addEvent(
         Event(EventType.EXCEPTION_OCCURED,
               retry_times=5,
               redo_module='home_page'))
     loop_module.appendSubModule(detail, True)
Beispiel #8
0
    def initShareHolderDetail(self, module_super):
        """Register the shareholder-detail request under ``module_super``.

        A module carrying ``Iterator("gdxx_list", "gdxx_rcd")`` is attached to
        ``module_super`` and given an ``EXCEPTION_OCCURED`` event with
        ``retry_times=5``. Its child, built around ``self.visitGdxq``, takes
        its URL from ``self.getGdxqUrl`` and builds headers around the
        supplied ``ua`` value.

        NOTE(review): ``ua`` is presumably bound by parameter name, so the
        name is kept -- confirm against the Module implementation.

        :param module_super: parent module that receives the iterating module.
        """

        def buildHeaders(ua):
            return {
                'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Connection': 'keep-alive',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'User-Agent': ua,
                'Host': 'gsxt.jxaic.gov.cn',
                'Cache-Control': 'max-age=0'
            }

        loop_module = Module(None, u"进入股东详情",
                             Iterator("gdxx_list", "gdxx_rcd"))
        module_super.appendSubModule(loop_module, True)

        detail = Module(self.visitGdxq, u"获取股东详情信息")
        detail.appendUrl(self.getGdxqUrl)
        detail.appendHeaders(buildHeaders)

        loop_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
        loop_module.appendSubModule(detail, True)
Beispiel #9
0
    def fetchStockholderDetail(self, module_super):
        """Register the shareholder-detail request under ``module_super``.

        A module carrying ``Iterator(seeds="gdxq_list", param_name="gdxq")``
        is attached to ``module_super``. Its child, built around
        ``self.visitGdxq``, extracts the link from the ``gdxq`` text, prefixes
        it with the gsxt.hljaic.gov.cn root, and is configured with fixed
        headers, utf-8 encoding, ``Sleep(3)`` and an ``EXCEPTION_OCCURED``
        event with ``retry_times=100``.

        :param module_super: parent module that receives the iterating module.
        :return: None
        """
        request_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
        }

        def getURL(gdxq):
            if not gdxq:
                return None
            # First run of text enclosed by (' and ') in the gdxq value.
            found = re.search(r"(?<=\(').+?(?='\))", gdxq)
            if found is None:
                return None
            return u"http://gsxt.hljaic.gov.cn" + found.group(0)

        seeds = Iterator(seeds="gdxq_list", param_name="gdxq")
        loop_module = Module(name=u"遍历股东详情", iterator=seeds)
        module_super.appendSubModule(loop_module)

        detail = Module(self.visitGdxq, u"抓取股东详情")
        detail.appendUrl(getURL)
        detail.appendHeaders(request_headers)
        detail.appendEncoding("utf-8")
        detail.addSleep(Sleep(3))
        detail.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))

        loop_module.appendSubModule(detail)
Beispiel #10
0
    def fetchCmpnyGdxq(self, module_super):
        """Register the shareholder-detail request under ``module_super``.

        A module carrying ``Iterator(seeds="gdxq_list", param_name="gdxq")``
        is attached to ``module_super``. Its child, built around
        ``self.visitGdxq``, uses the link extracted from the ``gdxq`` text as
        the URL (no host prefix is added here) and is configured with fixed
        headers, utf-8 encoding, ``Sleep(3)`` and an ``EXCEPTION_OCCURED``
        event with ``retry_times=100``.

        :param module_super: parent module that receives the iterating module.
        """
        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

        # TODO: add try/except handling
        def getURL(gdxq):
            if not gdxq:
                return None
            # First run of text enclosed by (' and ') in the gdxq value.
            found = re.search(r"(?<=\(').+?(?='\))", gdxq)
            return found.group(0) if found is not None else None

        seeds = Iterator(seeds="gdxq_list", param_name="gdxq")
        loop_module = Module(name=u"遍历股东详情", iterator=seeds)
        module_super.appendSubModule(loop_module)

        detail = Module(self.visitGdxq, u"抓取股东详情")
        detail.appendUrl(getURL)
        detail.appendHeaders(request_headers)
        detail.appendEncoding("utf-8")
        detail.addSleep(Sleep(3))
        detail.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))

        loop_module.appendSubModule(detail)
Beispiel #11
0
    def initBranchInfoPage(self, module_super):
        """Register the paged branch-office request under ``module_super``.

        A module carrying ``Iterator("fzjg_page_range", "pno")`` and the id
        ``"fzjg_pages"`` is attached to ``module_super`` and given a
        ``Sleep(2)``. Its child, built around ``self.visitFzjg``, requests
        ``QueryChildList.jspx`` with ``mainId`` and ``pno`` query values and
        uses ``company_url`` for the Referer.

        NOTE(review): ``id``, ``pno`` and ``company_url`` are presumably bound
        by parameter name, so they are kept verbatim even though ``id``
        shadows the builtin -- confirm against the Module implementation.

        :param module_super: parent module that receives the paging module.
        """

        def buildUrl(id, pno):
            return "http://aic.hainan.gov.cn:1888/QueryChildList.jspx?mainId=%s&pno=%s" % (
                id, pno)

        def buildHeaders(company_url):
            return {
                'Host': "aic.hainan.gov.cn:1888",
                'Connection': 'keep-alive',
                'Accept': '*/*',
                'Referer': "http://aic.hainan.gov.cn:1888" + company_url,
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8'
            }

        pager = Module(None, u"第八步_获取分支机构_开始翻页数据",
                       Iterator("fzjg_page_range", "pno"))
        pager.module_id = "fzjg_pages"
        module_super.appendSubModule(pager, True)

        fetcher = Module(self.visitFzjg, u"第八步_获取分支机构翻页数据")
        fetcher.appendUrl(buildUrl)
        fetcher.appendHeaders(buildHeaders)

        pager.addSleep(Sleep(2))
        pager.appendSubModule(fetcher)
Beispiel #12
0
    def initRouter(self, module_super):
        """Attach the Guangdong routing module to ``module_super``.

        The routing module (created with ``router=Router()``) gets an input
        function that labels the company URL with a source name, the first
        module of three site-specific crawlers as sub-modules, and an
        ``ASSERT_FAILED`` event (``retry_times=0``) whose assert function
        rejects an empty source and the Shenzhen credit site.

        :param module_super: parent module that receives the routing module.
        """
        router_module = Module(None, "广东公司适配", router=Router())

        def source_prepare(company_url):
            # Default label; it is also the label for gsxt.gzaic.gov.cn URLs,
            # which take precedence over the other two markers.
            source = u"企业信用网"
            if 'gsxt.gzaic.gov.cn' not in company_url:
                if '/GSpublicity/' in company_url:
                    source = u"企业信息网"
                elif 'szcredit' in company_url:
                    source = u"深圳信用网"

            # The label is recorded on the crawler and also returned.
            self.page_dict['source'] = source
            return {"source": source}

        router_module.appendInput(InputType.FUNCTION, source_prepare)

        gd_qyxx_crawler = CrawlerGdQyxx(self.pinyin, self)
        router_module.appendSubModule(
            gd_qyxx_crawler.module_manager.getFirstModule())
        gd_qyxy_crawler = CrawlerGdQyxy(self.pinyin, self)
        router_module.appendSubModule(
            gd_qyxy_crawler.module_manager.getFirstModule())
        sz_xy_crawler = CrawlerSzxy(self.pinyin, self)
        router_module.appendSubModule(
            sz_xy_crawler.module_manager.getFirstModule())

        def shenzhenAssert(source):
            if source and not (source == u"深圳信用网"):
                return True
            # Empty source or the Shenzhen site: flag the seed on the report
            # and fail the assertion.
            self.report.access_type = SeedAccessType.NO_TARGET_SOURCE
            return False

        router_module.addEvent(
            Event(event_type=EventType.ASSERT_FAILED,
                  retry_times=0,
                  assert_function=shenzhenAssert))
        module_super.appendSubModule(router_module, True)
Beispiel #13
0
    def initShareholderInfoPage(self, module_super):
        """Register the paged shareholder-list request under ``module_super``.

        A module carrying ``Iterator("gdxx_page_range", "pno")`` and the id
        ``"gdxx_pages"`` is attached to ``module_super`` and given a
        ``Sleep(2)``. Its child, built around ``self.visitGdxx``, requests
        ``QueryInvList.jspx`` with ``mainId`` and ``pno`` query values and
        uses ``company_url`` for the Referer.

        NOTE(review): ``id``, ``pno`` and ``company_url`` are presumably bound
        by parameter name, so they are kept verbatim even though ``id``
        shadows the builtin -- confirm against the Module implementation.

        :param module_super: parent module that receives the paging module.
        """

        def buildUrl(id, pno):
            return "http://aic.hainan.gov.cn:1888/QueryInvList.jspx?mainId=%s&pno=%s" % (
                id, pno)

        def buildHeaders(company_url):
            return {
                'Host': "aic.hainan.gov.cn:1888",
                'Connection': 'keep-alive',
                'Cache-Control': 'max-age=0',
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Referer': "http://aic.hainan.gov.cn:1888" + company_url,
                'Accept-Encoding': 'gzip, deflate',
                "Accept-Language": "zh-CN,zh;q=0.8"
            }

        pager = Module(None, u"第五步_获取股东信息_开始翻页数据",
                       Iterator("gdxx_page_range", "pno"))
        pager.module_id = "gdxx_pages"
        module_super.appendSubModule(pager, True)

        fetcher = Module(self.visitGdxx, u"第五步_获取股东信息翻页数据")
        fetcher.appendUrl(buildUrl)
        fetcher.appendHeaders(buildHeaders)

        pager.addSleep(Sleep(2))
        pager.appendSubModule(fetcher)
Beispiel #14
0
    def fetchPunishInfo(self, module_super):
        """Register the paged administrative-penalty request.

        A module carrying ``Iterator(seeds="xzcf_page_range",
        param_name="pno")`` is attached to ``module_super``. Its child, built
        around ``self.visitXzcf``, requests ``QueryPunList.jspx`` with the
        page number, the company id and a fresh ``random.random()`` value,
        and is configured with utf-8 encoding, ``Sleep(3)`` and an
        ``EXCEPTION_OCCURED`` event with ``retry_times=50``.

        NOTE(review): ``pno`` and ``company_id`` are presumably bound by
        parameter name, so the names are kept -- confirm against the Module
        implementation.

        :param module_super: parent module that receives the paging module.
        :return: None
        """

        def buildUrl(pno, company_id):
            # "ran" carries a new random number on every call.
            ran = str(random.random())
            return "http://218.95.241.36/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s" % (
                pno, company_id, ran)

        def buildHeaders(company_id):
            referer = ('http://218.95.241.36/businessPublicity.jspx?id='
                       + str(company_id))
            return {
                'Host': '218.95.241.36',
                'Proxy-Connection': 'keep-alive',
                'Accept': '*/*',
                'Referer': referer,
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
                'Accept-Encoding': 'gzip, deflate, sdch',
                "Accept-Language": "zh-CN,zh;q=0.8"
            }

        seeds = Iterator(seeds="xzcf_page_range", param_name="pno")
        pager = Module(name=u"遍历行政处罚翻页", iterator=seeds)
        module_super.appendSubModule(pager)

        fetcher = Module(self.visitXzcf, u"抓取行政处罚信息")
        fetcher.appendUrl(buildUrl)
        fetcher.appendHeaders(buildHeaders)
        fetcher.appendEncoding("utf-8")
        fetcher.addSleep(Sleep(3))
        fetcher.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        pager.appendSubModule(fetcher)
Beispiel #15
0
    def fetchStockholderDetail(self, module_super):
        """Register the shareholder-detail request under ``module_super``.

        A module carrying ``Iterator(seeds="gdxq_list", param_name="gdxq")``
        is attached to ``module_super``. Its child, built around
        ``self.visitGdxq``, extracts the link from the ``gdxq`` text, prefixes
        it with ``http://218.95.241.36``, and is configured with fixed
        headers, utf-8 encoding, ``Sleep(3)`` and an ``EXCEPTION_OCCURED``
        event with ``retry_times=50``.

        :param module_super: parent module that receives the iterating module.
        :return: None
        """
        request_headers = {
            "Host": "218.95.241.36",
            "Proxy-Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer": "http://218.95.241.36/searchList.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

        def getURL(gdxq):
            if not gdxq:
                return None
            # First run of text enclosed by (' and ') in the gdxq value.
            found = re.search(r"(?<=\(').+?(?='\))", gdxq)
            if found is None:
                return None
            return "http://218.95.241.36" + found.group(0)

        seeds = Iterator(seeds="gdxq_list", param_name="gdxq")
        loop_module = Module(name=u"遍历股东详情", iterator=seeds)
        module_super.appendSubModule(loop_module)

        detail = Module(self.visitGdxq, u"抓取股东详情")
        detail.appendUrl(getURL)
        detail.appendHeaders(request_headers)
        detail.appendEncoding("utf-8")
        detail.addSleep(Sleep(3))
        detail.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        loop_module.appendSubModule(detail)
Beispiel #16
0
    def fetchPunishInfo(self, module_super):
        """Register the paged administrative-penalty request.

        A module carrying ``Iterator(seeds="xzcf_page_range",
        param_name="pno")`` is attached to ``module_super``. Its child, built
        around ``self.visitXzcf``, requests ``QueryPunList.jspx`` on
        gsxt.hljaic.gov.cn with the page number, the company id and a fresh
        ``random.random()`` value, and is configured with utf-8 encoding,
        ``Sleep(3)`` and an ``EXCEPTION_OCCURED`` event with
        ``retry_times=50``.

        NOTE(review): ``pno`` and ``company_id`` are presumably bound by
        parameter name, so the names are kept -- confirm against the Module
        implementation.

        :param module_super: parent module that receives the paging module.
        :return: None
        """

        def buildUrl(pno, company_id):
            # "ran" carries a new random number on every call.
            ran = str(random.random())
            return "http://gsxt.hljaic.gov.cn/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s" % (
                pno, company_id, ran)

        def buildHeaders(company_id):
            referer = ('http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id='
                       + str(company_id))
            return {
                "Host": "gsxt.hljaic.gov.cn",
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                'Referer': referer,
            }

        seeds = Iterator(seeds="xzcf_page_range", param_name="pno")
        pager = Module(name=u"遍历行政处罚翻页", iterator=seeds)
        module_super.appendSubModule(pager)

        fetcher = Module(self.visitXzcf, u"抓取行政处罚信息")
        fetcher.appendUrl(buildUrl)
        fetcher.appendHeaders(buildHeaders)
        fetcher.appendEncoding("utf-8")
        fetcher.addSleep(Sleep(3))
        fetcher.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        pager.appendSubModule(fetcher)
Beispiel #17
0
# -*- coding: utf-8 -*-