Example #1
0
 def initShareHolderDetail(self, module_super):
     """Register the shareholder-detail modules under ``module_super``.

     An iterator module (id ``fetch_gdxq_info``) built from
     ``Iterator("xh_pripid", "xh_prid")`` is appended to ``module_super``;
     a POST sub-module bound to ``self.visitGdxq`` is appended to that
     iterator module.

     :param module_super: parent module that receives the iterator module.
     """
     iterator = Iterator("xh_pripid", "xh_prid")
     module = Module(None, "进入股东详情", iterator)
     module.module_id = "fetch_gdxq_info"
     module_super.appendSubModule(module, True)

     sub_module = Module(self.visitGdxq, u"获取股东翻页信息")
     sub_module.appendUrl('http://gsxt.scaic.gov.cn/ztxy.do')
     sub_module.appendHeaders(
         lambda ua: {
             "User-Agent": ua,
             "Accept":
             "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Host": "gsxt.scaic.gov.cn",
             "Origin": "http://gsxt.scaic.gov.cn",
             "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
             "Connection": "keep-alive"
         })
     sub_module.appendWebMethod("post")
     sub_module.appendPostData(
         lambda xh_prid: {
             'maent.pripid': xh_prid[1],
             'maent.entbigtype': xh_prid[0],
             'method': 'tzrCzxxDetial',
             # 'random' used to be listed twice in this literal; a dict keeps
             # only one entry per key, so it is spelled exactly once.
             'random': str(int(time.time() * 1000))
         })
     # Retry and sleep settings are attached to the iterator module.
     module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
     module.addSleep(Sleep(2))
     module.appendSubModule(sub_module, True)
Example #2
0
    def initSubmitYzm(self):
        """Append the captcha-submission step to ``self.module_manager``.

        The step is bound to ``self.getJson`` and POSTs ``checkNo`` to
        ``checkCheckNo.jspx``. Both registered events name ``hn_yzm_pic``
        as their ``redo_module``.
        """
        request_headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Host": "aic.hainan.gov.cn:1888",
            "Origin": "http://aic.hainan.gov.cn:1888",
            "Proxy-Connection": "keep-alive",
            "Referer": "http://aic.hainan.gov.cn:1888/search.jspx",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }

        step = Module(self.getJson, u"第二步_提交验证码验证")
        step.appendUrl("http://aic.hainan.gov.cn:1888/checkCheckNo.jspx")
        step.appendHeaders(request_headers)
        step.appendWebMethod("post")
        step.appendPostData(lambda yzm: {"checkNo": yzm})
        step.addSleep(Sleep(3))
        step.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=0, redo_module="hn_yzm_pic"))

        def submitYzmResult(json=None):
            # Captcha check: passes only when a non-empty response contains
            # the literal success marker.
            return bool(json) and '{success:true}' in json

        step.addEvent(Event(EventType.ASSERT_FAILED, assert_function=submitYzmResult, retry_times=0, redo_module="hn_yzm_pic"))
        self.module_manager.appendSubModule(step)
Example #3
0
    def initToken(self):
        """Append the token-fetching module (id ``module_token``).

        The module is bound to ``self.getWebHtml``, requests
        ``popup_captcha`` and exposes an output named ``token`` computed by
        ``getToken`` from the returned HTML.
        """
        module = Module(self.getWebHtml, u"令牌获取")
        module.module_id = "module_token"
        module.appendUrl('https://www.sgs.gov.cn/notice/search/popup_captcha')
        module.appendHeaders({
            'Host': 'www.sgs.gov.cn',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://www.sgs.gov.cn/notice/home'
        })
        module.appendEncoding("utf-8")

        def getToken(html):
            """Return the ``session.token`` value found in ``html``, or None."""
            if not html or '\"session.token\": \"' not in html:
                self.holder.logging.error(u'获取session.token失败!')
                return None
            # The substring test above does not guarantee a closing quote, so
            # the regex may still fail to match; guard against a None match.
            match = re.search(r'\"session\.token\": \"(.*?)\"', html)
            token = match.group(1) if match else None
            if not token:
                self.holder.logging.error(u'提取token失败!')
                # Stop here instead of logging and returning an empty token.
                return None
            self.holder.logging.info('token: %s' % token)
            return token

        module.appendOutput(name="token",
                            type=OutputType.FUNCTION,
                            function=getToken)
        module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
        module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
        self.module_manager.appendSubModule(module)
Example #4
0
    def initRouter(self, module_super):
        """Append the router module and its three crawler entry modules.

        ``source_prepare`` derives a ``source`` label from ``company_url``,
        stores it in ``self.page_dict`` and returns it as input. The first
        module of each of ``CrawlerGdQyxx``, ``CrawlerGdQyxy`` and
        ``CrawlerSzxy`` is attached as a sub-module, in that order.

        :param module_super: parent module that receives the router module.
        """
        router_module = Module(None, "广东公司适配", router=Router())

        # Checked in order; the first marker found in the URL decides.
        url_markers = (
            ('gsxt.gzaic.gov.cn', u"企业信用网"),
            ('/GSpublicity/', u"企业信息网"),
            ('szcredit', u"深圳信用网"),
        )

        def source_prepare(company_url):
            source = u"企业信用网"  # label used when no marker matches
            for marker, label in url_markers:
                if marker in company_url:
                    source = label
                    break

            self.page_dict['source'] = source

            return {"source": source}

        router_module.appendInput(InputType.FUNCTION, source_prepare)

        for crawler_cls in (CrawlerGdQyxx, CrawlerGdQyxy, CrawlerSzxy):
            crawler = crawler_cls(self.pinyin, self)
            router_module.appendSubModule(crawler.module_manager.getFirstModule())

        def shenzhenAssert(source):
            # A missing source or the Shenzhen label is reported as
            # NO_TARGET_SOURCE and fails the assertion.
            if source and source != u"深圳信用网":
                return True
            self.report.access_type = SeedAccessType.NO_TARGET_SOURCE
            return False

        router_module.addEvent(Event(event_type=EventType.ASSERT_FAILED, retry_times=0, assert_function=shenzhenAssert))
        module_super.appendSubModule(router_module, True)
Example #5
0
    def initConfigValidateCode(self):
        """Append the captcha-image module (id ``check_validatecode``).

        The module is bound to ``self.visitValidateCode``. Its URL embeds
        ``time.time()`` evaluated when this method runs. A failed
        ``assertYzm`` check redoes this same module.
        """
        module = Module(self.visitValidateCode, u'获取验证码图片')
        module.module_id = "check_validatecode"
        module.appendUrl(
            "http://gsxt.scaic.gov.cn/ztxy.do?method=createYzm&dt=" +
            str(int(time.time())) + "&random=" + str(int(time.time())))
        module.appendHeaders(
            lambda ua: {
                "Host": "gsxt.scaic.gov.cn",
                "User-Agent": ua,
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Accept-Encoding": "gzip, deflate, sdch",
                "Connection": "keep-alive",
                'Cache-Control': 'max-age=0',
                # Header values must be strings; an int is rejected by HTTP
                # libraries that validate header types.
                'Upgrade-Insecure-Requests': '1'
            })

        # Simple sanity assertion on the recognised captcha value.
        def assertYzm(yzm=None):
            # Same output as the former print statement, but this form also
            # parses on Python 3.
            print('Yzm  ,  %s' % (yzm,))
            # Any int (including 0) is accepted; otherwise use truthiness.
            return isinstance(yzm, int) or bool(yzm)

        module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=assertYzm,
                  redo_module="check_validatecode"))
        module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
        module.addSleep(Sleep(2))
        self.module_manager.appendSubModule(module, True)
Example #6
0
    def initConfigValidateCode(self):
        """Append the captcha module (id ``init_validate_code``).

        The module is bound to ``self.visitValidateCode``; its URL carries
        a ``random.random()`` value generated when this method runs.
        """
        captcha = Module(self.visitValidateCode, u"获取验证码")
        captcha.module_id = "init_validate_code"

        captcha_url = ("http://gsxt.hljaic.gov.cn/validateCode.jspx?type=0&id=" +
                       str(random.random()))
        captcha_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
        }
        captcha.appendUrl(captcha_url)
        captcha.appendHeaders(captcha_headers)
        captcha.addSleep(Sleep(3))
        captcha.appendEncoding("utf-8")

        def checkValidatecode(yzm):
            # A falsy captcha value is logged and fails the assertion.
            if yzm:
                return True
            self.holder.logging.warning(u"获取验证码失败")
            return False

        captcha.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=checkValidatecode))

        self.module_manager.appendSubModule(captcha, True)
Example #7
0
    def precheckCmpyKey(self):
        """
        Append a module that validates the search keyword.

        ``checkCmpyKey`` rejects ``company_key`` when:
            1. its length is below 2 or above 60 characters;
            2. it contains an illegal character.
        :return: None
        """
        module = Module(None, u"查询条件检查")
        illegal_chars = (",", "'", '"', "<", ">", ";", "_")

        def checkCmpyKey(company_key):
            """Return True when ``company_key`` passes every check."""
            try:
                if not 2 <= len(company_key) <= 60:
                    self.holder.logging.warning(u"查询条件长度不能小于2个字符且不能大于60个字符!")
                    return False
                # Membership test: the earlier ``find(x) > 0`` missed an
                # illegal character at index 0, and ``filter`` is always
                # truthy on Python 3.
                if any(ch in company_key for ch in illegal_chars):
                    self.holder.logging.warning(u"查询条件中含有非法字符!")
                    return False
                # Parentheses are tolerated: strip them before the whitelist.
                company_key = company_key.replace(u"(", "").replace(u")", "")
                if not re.match(u"^(\w|[\u4E00-\u9FA5])*$", company_key):
                    self.holder.logging.warning(u"查询条件中含有非法字符!")
                    return False
                return True
            except Exception:
                # Best effort: any failure while checking (e.g. a None key or
                # a decoding error) counts as an invalid keyword.
                return False

        module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=2,
                  assert_function=checkCmpyKey))

        self.module_manager.appendSubModule(module)
Example #8
0
 def initPenaltyInfo(self, module_super):
     """Append the administrative-penalty request module to ``module_super``.

     The module is bound to ``self.visitXzcf`` and POSTs to ``ztxy.do``
     with ``method=cfInfo``.

     :param module_super: parent module that receives the new module.
     """
     # NOTE(review): parameter names of the helpers below look significant
     # to the framework -- keep them unchanged.
     def build_headers(ua):
         return {
             "User-Agent": ua,
             "Accept":
             "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Host": "gsxt.scaic.gov.cn",
             "Origin": "http://gsxt.scaic.gov.cn",
             "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
             "Connection": "keep-alive"
         }

     def build_post_data(pripid, entbigtype):
         return {
             'czmk': 'czmk3',
             'maent.entbigtype': entbigtype,
             'maent.pripid': pripid,
             'method': 'cfInfo',
             'random': str(int(time.time() * 1000))
         }

     penalty = Module(self.visitXzcf, u"获取行政处罚信息")
     penalty.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
     penalty.appendHeaders(build_headers)
     penalty.appendWebMethod("post")
     penalty.appendPostData(build_post_data)
     penalty.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
     penalty.addSleep(Sleep(2))
     module_super.appendSubModule(penalty, True)
Example #9
0
    def checkCompanyName(self):
        """Append the company-name verification module.

        The module is bound to ``self.getWebHtml`` and POSTs ``qymc`` to
        ``keyword.do``. A failed ``assertRecode`` check redoes the
        ``check_validatecode`` module.
        """
        verifier = Module(self.getWebHtml, u'验证公司名称')
        verify_url = (
            "http://gsxt.scaic.gov.cn/keyword.do?method=keywordFilter&random="
            + str(int(time.time())))
        verifier.appendUrl(verify_url)

        def build_headers(ua):
            return {
                "User-Agent": ua,
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Host": "gsxt.scaic.gov.cn",
                "Origin": "http://gsxt.scaic.gov.cn",
                "Referer": "http://gsxt.scaic.gov.cn/ztxy.do?method=index",
                "Connection": "keep-alive"
            }

        verifier.appendHeaders(build_headers)
        verifier.appendPostData(lambda company_key: {'qymc': company_key})
        verifier.appendWebMethod('post')

        def assertRecode(html):
            # A NON_COMPANY access type is rewritten to ERROR before checking.
            if self.report.access_type == SeedAccessType.NON_COMPANY:
                self.report.access_type = SeedAccessType.ERROR
            # Passes only when the stripped response body is exactly '1'.
            return bool(html) and html.strip() == '1'

        verifier.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=assertRecode,
                  redo_module="check_validatecode"))
        verifier.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
        verifier.addSleep(Sleep(3))
        self.module_manager.appendSubModule(verifier, True)
Example #10
0
    def initQynbInfo(self, module_super):
        """Append the two annual-report modules to ``module_super``.

        The first module has no visit callback; its output function stores
        the four-digit ``nbnd`` value from ``qynb_url`` in
        ``self.value_dict['nb_name']``. The second is bound to
        ``self.visitQynb`` and requests ``qynb_url`` itself.

        :param module_super: parent module that receives both modules.
        """
        year_module = Module(None, u"设置年份")

        def saveNbyear(qynb_url):
            # Raises IndexError when the URL carries no ``nbnd=YYYY`` part.
            years = re.findall(r'nbnd=(\d{4})', qynb_url, re.S)
            self.value_dict['nb_name'] = years[0]

        year_module.appendOutput(type=OutputType.FUNCTION, function=saveNbyear)
        module_super.appendSubModule(year_module, True)

        def report_headers(ua, qyid, company_zch, qylx):
            referer = (
                "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/qygs_ViewQynb.pt?qyid=%s&zch=%s&qylx=%s&num=0&showgdxx=true"
                % (qyid, company_zch, qylx))
            return {
                "User-Agent": ua,
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Host": "gsxt.jxaic.gov.cn",
                "Referer": referer,
                "Connection": "keep-alive"
            }

        report_module = Module(self.visitQynb, u"获取企业年报")
        report_module.appendUrl(lambda qynb_url: qynb_url)
        report_module.appendHeaders(report_headers)
        report_module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5))
        report_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
        report_module.addSleep(Sleep(2))
        module_super.appendSubModule(report_module, True)
Example #11
0
    def initConfigValidateCode(self):
        """Append the captcha module (id ``init_validate_code``).

        The module is bound to ``self.visitValidateCode``; its URL carries
        a ``random.random()`` value generated when this method runs.
        """
        captcha = Module(self.visitValidateCode, u"获取验证码")
        captcha.module_id = "init_validate_code"

        captcha_url = ("http://www.nmgs.gov.cn:7001/aiccips/verify.html?random=" +
                       str(random.random()))
        captcha_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Accept': 'image/webp,*/*;q=0.8',
            'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }
        captcha.appendUrl(captcha_url)
        captcha.appendHeaders(captcha_headers)
        captcha.addSleep(Sleep(3))
        captcha.appendEncoding("utf-8")

        def checkValidatecode(yzm):
            # The assertion passes for any truthy captcha value.
            return bool(yzm)

        captcha.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=checkValidatecode))

        self.module_manager.appendSubModule(captcha, True)
Example #12
0
 def initCompanyInfo(self, module_super):
     """Append the basic-information module to ``module_super``.

     The module is bound to ``self.visitJbxx`` and requests
     ``/pub/`` + ``com[1]``. It stores the first ``company_zch_list``
     entry in ``self.value_dict['company_zch']`` and exposes an optional
     ``recid_list`` output. Both events redo the ``home_page`` module.

     :param module_super: parent module that receives the new module.
     """
     def build_headers(ua):
         return {
             "Host": "218.57.139.24",
             "User-Agent": ua,
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive",
             'Referer': 'http://218.57.139.24/'}

     def setCompanyZch(company_zch_list=None):
         # First entry, stripped; None when the list is empty or missing.
         zch = None
         if company_zch_list:
             zch = company_zch_list[0].strip()
         self.value_dict['company_zch'] = zch

     def getGdxxParms(html):
         # Collect every "recid" value found in the page source.
         if not html:
             return []
         return re.findall(r'\,"recid":"(.+?)",', html, re.S)

     info = Module(self.visitJbxx, u"基本信息")
     info.appendUrl(lambda com: "http://218.57.139.24/pub/" + com[1])
     info.appendHeaders(build_headers)
     info.appendOutput("company_zch_list", '//table[1]/tr[2]/td[1]/text()', OutputType.LIST)
     info.appendOutput(type=OutputType.FUNCTION, function=setCompanyZch)
     info.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5, redo_module='home_page'))
     # Extract the list used for the shareholder details.
     info.appendOutput(name="recid_list",
                       type=OutputType.FUNCTION,
                       function=getGdxxParms,
                       show_up=OutputParameterShowUpType.OPTIONAL)
     info.addSleep(Sleep(2))
     info.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5, redo_module='home_page'))
     module_super.appendSubModule(info, True)
Example #13
0
 def initSearchPage(self, module_super):
     """Append the paged search-list module to ``module_super``.

     The module is bound to ``self.visitSearchList``, POSTs the token,
     keyword and page number to ``ent_info_list`` and exposes a
     ``search_list`` output.

     :param module_super: parent module that receives the new module.
     """
     search_headers = {
         'Host': 'www.sgs.gov.cn',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
         'Accept-Encoding': 'gzip, deflate',
         'Referer': 'https://www.sgs.gov.cn/notice/home',
         'Content-Type': 'application/x-www-form-urlencoded'
     }

     def build_post_data(token, company_key, page_no):
         return {
             'searchType': '1',
             'captcha': 0,
             'session.token': token,
             'condition.keyword': company_key,
             'condition.pageNo': page_no
         }

     search = Module(self.visitSearchList, u"搜索列表-翻页")
     search.appendUrl('https://www.sgs.gov.cn/notice/search/ent_info_list')
     search.appendHeaders(search_headers)
     search.appendWebMethod("post")
     search.appendPostData(build_post_data)
     search.appendOutput("search_list", './/div[@class="list-item"]',
                         OutputType.LIST)
     search.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=2))
     module_super.appendSubModule(search)
Example #14
0
    def checkValidateCode(self):
        """Append the captcha-check module (id ``check_validate_code``).

        The module is bound to ``self.getJson`` and POSTs ``code`` and
        ``textfield`` to ``checkCode.html``. A failed ``checkValidatecode``
        check redoes the ``init_validate_code`` module.
        """
        checker = Module(self.getJson, u"检验验证码")
        checker.module_id = "check_validate_code"

        check_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'X-Requested-With': 'XMLHttpRequest',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }
        checker.appendUrl(
            "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/checkCode.html"
        )
        checker.appendHeaders(check_headers)
        checker.appendWebMethod("post")
        checker.addSleep(Sleep(3))
        checker.appendEncoding("utf-8")
        checker.appendPostData(lambda yzm, company_key: {
            "code": yzm,
            "textfield": company_key
        })

        def checkValidatecode(web=None):
            if not web:
                return False
            # Every double-quoted token in the response body, in order.
            flags = re.compile(r'\"([\s\S]*?)\"').findall(str(web.body))
            malformed = len(flags) != 4 or flags[2] != 'textfield'
            if malformed or (flags[0] == 'flag' and flags[1] != str(1)):
                self.holder.logging.warning(u"验证码校验失败!")
                return False
            # The fourth token is stored after decoding its escapes.
            self.value_dict["textfield"] = flags[3].decode(
                'raw_unicode_escape')
            return True

        checker.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=checkValidatecode,
                  redo_module="init_validate_code"))

        self.module_manager.appendSubModule(checker, True)
Example #15
0
 def initConfigValidateCode(self):
     """Append the captcha module (id ``module_validate_code``).

     The module is bound to ``self.visitValidateCode``; its URL carries a
     ``random.random()`` value generated when this method runs.
     """
     captcha = Module(self.visitValidateCode, "验证码")
     captcha.module_id = "module_validate_code"
     captcha_url = "http://gsxt.gdgs.gov.cn/aiccips/verify.html?random=" + str(random.random())
     captcha_headers = {
         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
         'Accept-Encoding': 'gzip, deflate',
         'Connection': 'keep-alive',
         'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0',
         'Host': 'gsxt.gdgs.gov.cn',
         'Referer': 'http://gsxt.gdgs.gov.cn/aiccips/'
     }
     captcha.appendUrl(captcha_url)
     captcha.appendHeaders(captcha_headers)
     captcha.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
     self.module_manager.appendSubModule(captcha)
Example #16
0
    def getStockholderInfo(self, module_super):
        """
        Append the paged shareholder-list module (id ``get_stockholder_info``).

        The module is bound to ``self.visitGdxx``; its optional output
        function collects the ``onclick`` attributes of the detail links.
        :param module_super: parent module that receives the new module.
        :return: None
        """
        holders = Module(self.visitGdxx, u"抓取股东信息")
        holders.module_id = "get_stockholder_info"

        def page_url(pno, company_id):
            return ("http://218.95.241.36/QueryInvList.jspx?pno=%s&mainId=%s" %
                    (pno, company_id))

        def page_headers(company_id):
            referer = ('http://218.95.241.36/businessPublicity.jspx?id=' +
                       str(company_id))
            return {
                'Host': '218.95.241.36',
                'Proxy-Connection': 'keep-alive',
                'Cache-Control': 'max-age=0',
                'User-Agent':
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Referer': referer,
                'Accept-Encoding': 'gzip, deflate',
                "Accept-Language": "zh-CN,zh;q=0.8"
            }

        holders.appendUrl(page_url)
        holders.appendHeaders(page_headers)
        holders.appendEncoding("utf-8")
        holders.addSleep(Sleep(3))

        def getGdxqList(html):
            # Any parsing failure is logged and yields an empty dict.
            try:
                onclick_attrs = etree.HTML(html).xpath(
                    ".//*[@class='detailsList']/tr/td/a/@onclick")
            except Exception as e:
                self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
                return dict()
            return {"gdxq_list": onclick_attrs}

        holders.appendOutput(type=OutputType.FUNCTION,
                             function=getGdxqList,
                             show_up=OutputParameterShowUpType.OPTIONAL)
        holders.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        module_super.appendSubModule(holders, True)
Example #17
0
    def initConfigBaseInfo(self, module_super):
        """Append the basic-information module to ``module_super``.

        The module is bound to ``self.visitJbxx``. ``prepare`` turns the
        query string of ``company_url`` into input values; the request URL
        and Referer are built from ``qyid``, ``zch`` and ``qylx``. Both
        events redo the ``module_validate_code`` module.

        :param module_super: parent module that receives the new module.
        """
        module = Module(self.visitJbxx, u"基本信息")

        def prepare(company_url):
            """Return the raw (undecoded) query parameters of ``company_url``."""
            query_ = {}
            for pair in urlparse.urlparse(company_url).query.split("&"):
                # partition keeps everything after the first '=' and lets us
                # skip fragments without one (including the single empty
                # fragment produced by an empty query) instead of raising
                # IndexError.
                key, sep, value = pair.partition("=")
                if sep:
                    query_[key] = value
            return query_

        module.appendInput(InputType.FUNCTION, prepare)

        def assertReqArgs(zch):  # assert that the request argument is valid
            return True if zch else False

        module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=5,
                  assert_function=assertReqArgs,
                  redo_module="module_validate_code"))
        module.appendOutput(name='company_zch',
                            type=OutputType.FUNCTION,
                            function=lambda zch: zch.strip(),
                            show_up=OutputParameterShowUpType.OPTIONAL)
        module.appendUrl(
            lambda qyid, zch, qylx:
            "http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/gsgs_viewDjxx.pt?qyid=%s&zch=%s&qylx=%s&num=undefined&showgdxx=true"
            % (qyid, zch, qylx))
        module.appendHeaders(
            lambda ua, qylx, qyid, zch: {
                'Accept-Language':
                'en-US,en;q=0.5',
                'Accept-Encoding':
                'gzip, deflate',
                'Connection':
                'keep-alive',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'User-Agent':
                ua,
                'Referer':
                'http://gsxt.jxaic.gov.cn/ECPS/ccjcgs/ccjcgs_ccjcgsIndexDetail.pt?qylx=%s&qyid=%s&zch=%s&tabName=1'
                % (qylx, qyid, zch),
                'Host':
                'gsxt.jxaic.gov.cn'
            })
        module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=5,
                  redo_module="module_validate_code"))
        module_super.appendSubModule(module, True)
Example #18
0
    def getAnnalsInfo(self, module_super):
        """
        Append an iterator over ``annals_list`` and its detail sub-module.

        The sub-module (id ``get_annals_info``) is bound to
        ``self.visitQynb`` and redoes itself when an exception occurs.
        :param module_super: parent module that receives the iterator module.
        :return: None
        """
        annals_iter = Module(None, u"遍历年报列表",
                             Iterator("annals_list", "annals"))
        module_super.appendSubModule(annals_iter, True)

        detail = Module(self.visitQynb, u"抓取年报详情")
        detail.module_id = "get_annals_info"

        def prepareParams(annals):
            # Needs at least two entries; otherwise nothing is supplied.
            if not annals or len(annals) < 2:
                return dict()
            return {'nb_url': annals[0], 'nb_name': annals[1]}

        detail.appendInput(InputType.FUNCTION, input_value=prepareParams)

        def getURL(nb_url=None):
            # The stored URL is a path; prefix it with the host.
            return u'http://218.95.241.36' + nb_url if nb_url else None

        detail.appendUrl(getURL)

        detail_headers = {
            "Host": "218.95.241.36",
            "Proxy-Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer": "http://218.95.241.36/searchList.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }
        detail.appendHeaders(detail_headers)
        detail.appendEncoding("utf-8")
        detail.addSleep(Sleep(3))

        detail.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="get_annals_info"))

        annals_iter.appendSubModule(detail)
Example #19
0
 def initYzmPic(self):
     """Append the captcha-image module to ``self.module_manager``.

     The module is bound to ``self.visitValidateCode`` and requests
     ``/securitycode``; an exception redoes the ``home_page`` module.
     """
     def build_headers(ua):
         return {
             "Host": "218.57.139.24",
             "User-Agent": ua,
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive"
         }

     captcha = Module(self.visitValidateCode, u'获取验证码图片')
     captcha.appendUrl('http://218.57.139.24/securitycode')
     captcha.appendHeaders(build_headers)
     captcha.addSleep(Sleep(2))
     captcha.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5, redo_module='home_page'))
     self.module_manager.appendSubModule(captcha, True)
Example #20
0
    def getStockholderInfo(self, module_super):
        """
        Append the paged shareholder-list module (id ``get_stockholder_info``).

        The module is bound to ``self.visitGdxx``; its optional output
        function collects the ``onclick`` attributes of the detail links.
        :param module_super: parent module that receives the new module.
        :return: None
        """
        holders = Module(self.visitGdxx, u"抓取股东信息")
        holders.module_id = "get_stockholder_info"

        def page_url(pno, company_id):
            return ("http://gsxt.hljaic.gov.cn/QueryInvList.jspx?pno=%s&mainId=%s" %
                    (pno, company_id))

        def page_headers(company_id):
            referer = ('http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' +
                       str(company_id))
            return {
                "Host": "gsxt.hljaic.gov.cn",
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                'Referer': referer,
            }

        holders.appendUrl(page_url)
        holders.appendHeaders(page_headers)
        holders.appendEncoding("utf-8")
        holders.addSleep(Sleep(3))

        def getGdxqList(html):
            # Any parsing failure is logged and yields an empty dict.
            try:
                onclick_attrs = etree.HTML(html).xpath(
                    ".//*[@class='detailsList']/tr/td/a/@onclick")
            except Exception as e:
                self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
                return dict()
            return {"gdxq_list": onclick_attrs}

        holders.appendOutput(type=OutputType.FUNCTION,
                             function=getGdxqList,
                             show_up=OutputParameterShowUpType.OPTIONAL)
        holders.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        module_super.appendSubModule(holders, True)
Example #21
0
 def initHomePage(self):
     """Append the home-page module (id ``home_page``).

     The module is bound to ``self.visitHomePage`` and exposes a ``csrf``
     list output read from the ``_csrf`` input of the search form.
     """
     def build_headers(ua):
         return {
             "Host": '218.57.139.24',
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "en-US,en;q=0.8",
             "Accept-Encoding": "gzip, deflate, sdch",
             "Connection": "keep-alive",
             'User-Agent': ua}

     csrf_xpath = '//form[@id="searchform"]/input[@name="_csrf"]/@value'

     home = Module(self.visitHomePage, u"访问首页")
     home.module_id = "home_page"
     home.appendUrl('http://218.57.139.24')
     home.appendHeaders(build_headers)
     home.addSleep(Sleep(2))
     home.appendOutput("csrf", csrf_xpath, OutputType.LIST)
     home.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
     home.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
     self.module_manager.appendSubModule(home, True)
Example #22
0
    def getAnnalsInfo(self, module_super):
        """Append an iterator over ``annals_list`` and its detail sub-module.

        The sub-module (id ``get_annals_info``) is bound to
        ``self.visitQynb`` and redoes itself when an exception occurs.

        :param module_super: parent module that receives the iterator module.
        """
        annals_iter = Module(None, u"遍历年报列表",
                             Iterator("annals_list", "annals"))
        module_super.appendSubModule(annals_iter, True)

        detail = Module(self.visitQynb, u"抓取年报详情")
        detail.module_id = "get_annals_info"

        def prepareParams(annals):
            # Needs at least two entries; otherwise nothing is supplied.
            if not annals or len(annals) < 2:
                return dict()
            return {'nb_url': annals[0], 'nb_name': annals[1]}

        detail.appendInput(InputType.FUNCTION, input_value=prepareParams)

        def getURL(nb_url=None):
            # The stored URL is a path; prefix it with the host.
            return u'http://gsxt.hljaic.gov.cn' + nb_url if nb_url else None

        detail.appendUrl(getURL)

        detail_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
        }
        detail.appendHeaders(detail_headers)
        detail.appendEncoding("utf-8")
        detail.addSleep(Sleep(3))

        detail.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="get_annals_info"))

        annals_iter.appendSubModule(detail)
Example #23
0
    def initCompanyInfoPrepare(self, module_super):
        """Append the pre-processing module that runs before company crawling.

        Its output function splits ``com`` into ``company_url`` and
        ``company_name``. An exception redoes ``module_validate_code``.

        :param module_super: parent module that receives the new module.
        """
        pre_module = Module(None, u"抓取公司前的预处理")

        def prepare(com):
            # Both entries must be present and non-blank after stripping;
            # otherwise an empty dict is returned.
            if not com or len(com) < 2:
                return {}
            url = com[0].strip()
            if not url:
                return {}
            name = com[1].strip()
            if not name:
                return {}
            # The company name is published under the key 'company_name'.
            return {"company_url": url, "company_name": name}

        pre_module.appendOutput(type=OutputType.FUNCTION, function=prepare)
        pre_module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5))
        pre_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=5,
                  redo_module="module_validate_code"))
        module_super.appendSubModule(pre_module, True)
Example #24
0
 def initQynbInfo(self, module_super):
     """Append the two annual-report modules to ``module_super``.

     The first module has no visit callback and outputs ``nb_name`` from
     ``qynb_tuper[0]``; the second is bound to ``self.visitQynb`` and
     requests the path held in ``qynb_tuper[1]``.

     :param module_super: parent module that receives both modules.
     """
     year_module = Module(None, u"设置年份")
     year_module.appendOutput(
         type=OutputType.FUNCTION,
         function=lambda qynb_tuper: {'nb_name': qynb_tuper[0]})
     module_super.appendSubModule(year_module, True)

     def report_headers(ua, com):
         referer = "http://218.57.139.24/pub/qygsdetail/%s/%s" % (com[3], com[2])
         return {
             "User-Agent": ua,
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Host": "218.57.139.24",
             "Referer": referer,
             "Connection": "keep-alive"}

     report_module = Module(self.visitQynb, u"获取企业年报")
     report_module.appendUrl(
         lambda qynb_tuper: "http://218.57.139.24%s" % qynb_tuper[1])
     report_module.appendHeaders(report_headers)
     report_module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=5))
     report_module.addSleep(Sleep(2))
     module_super.appendSubModule(report_module, True)
Example #25
0
    def initConfigCompanyInfoPre(self, module_super):
        """Append the "fetch_company_info" preparation step to ``module_super``.

        The step's output function maps the first three entries of ``tag_a``
        to ``company_name``, ``entbigtype`` and ``pripid``.
        """
        def setComParms(tag_a):
            # Direct indexing: a ``tag_a`` shorter than three entries raises.
            return {
                'company_name': tag_a[0],
                'entbigtype': tag_a[1],
                'pripid': tag_a[2]
            }

        pre_module = Module(None, u"抓取公司前的预处理")
        pre_module.module_id = "fetch_company_info"
        pre_module.appendOutput(
            type=OutputType.FUNCTION,
            function=setComParms,
            show_up=OutputParameterShowUpType.OPTIONAL)

        exception_retry = Event(EventType.EXCEPTION_OCCURED, retry_times=100)
        pre_module.addEvent(exception_retry)
        module_super.appendSubModule(pre_module, True)
Example #26
0
 def initConfigValidateCode(self):
     """Register the captcha-image module on ``self.module_manager``.

     The module is built around ``self.visitValidateCode`` and is given the
     id "module_validate_code".
     """
     captcha_module = Module(self.visitValidateCode, u"验证码")
     captcha_module.module_id = "module_validate_code"

     # NOTE: random.random() is evaluated here, once, so the string handed to
     # appendUrl carries a fixed ``t=`` value from the moment this method runs.
     captcha_url = (
         'http://gsxt.jxaic.gov.cn/ECPS/common/common_getJjYzmImg.pt?yzmName=searchYzm&imgWidth=180&t='
         + str(random.random()))
     captcha_module.appendUrl(captcha_url)

     request_headers = {
         'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4',
         'Accept-Encoding': 'gzip, deflate, sdch',
         'Connection': 'keep-alive',
         'Accept': 'image/webp,*/*;q=0.8',
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
         'Host': 'gsxt.jxaic.gov.cn',
         'Referer': 'http://gsxt.jxaic.gov.cn/'
     }
     captcha_module.appendHeaders(request_headers)

     exception_retry = Event(EventType.EXCEPTION_OCCURED, retry_times=100)
     captcha_module.addEvent(exception_retry)
     self.module_manager.appendSubModule(captcha_module, True)
Example #27
0
    def checkValidateCode(self):
        """Verify the captcha produced by the previous module.

        Registers a POST module (id "check_validate_code") on
        ``self.module_manager`` that submits ``yzm`` as ``checkNo``. Both a
        failed assertion and an exception name "init_validate_code" as the
        module to redo.
        """
        def checkValidatecode(json=None):
            # Passes only when the text contains the literal marker
            # "{success:true}"; a missing/empty value or any other content
            # logs a warning and fails.
            if json and "{success:true}" in json:
                return True
            self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
            return False

        verify_module = Module(self.getJson, u"校验验证码")
        verify_module.module_id = "check_validate_code"

        verify_module.appendUrl('http://218.95.241.36/checkCheckNo.jspx')
        request_headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Proxy-Connection": "keep-alive",
            "Host": "218.95.241.36",
            "Origin": "http://218.95.241.36",
            "Referer": "http://218.95.241.36/search.jspx",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        verify_module.appendHeaders(request_headers)
        verify_module.appendWebMethod("post")
        verify_module.appendEncoding("utf-8")
        verify_module.appendPostData(lambda yzm: {"checkNo": yzm})

        assert_retry = Event(
            EventType.ASSERT_FAILED,
            retry_times=100,
            assert_function=checkValidatecode,
            redo_module="init_validate_code")
        verify_module.addEvent(assert_retry)

        exception_retry = Event(
            EventType.EXCEPTION_OCCURED,
            retry_times=100,
            redo_module="init_validate_code")
        verify_module.addEvent(exception_retry)

        self.module_manager.appendSubModule(verify_module, True)
Example #28
0
    def initQynbInfo(self, module_super):
        """Append the "set year" and "fetch annual report" steps to module_super.

        The first step stores the stripped ``qynb_year`` in
        ``self.value_dict['nb_name']``. The second POSTs to ztxy.do with
        method ``ndbgDetail`` and declares two list outputs, ``nb_zch`` and
        ``nb_qym``.
        """
        def saveNbyear(qynb_year):
            # Store the year only when it is non-empty after stripping.
            if not qynb_year:
                return
            year = qynb_year.strip()
            if year:
                self.value_dict['nb_name'] = year

        year_module = Module(None, u"设置年份")
        year_module.appendOutput(type=OutputType.FUNCTION, function=saveNbyear)
        module_super.appendSubModule(year_module, True)

        report_module = Module(self.visitQynb, u"获取企业年报")
        report_module.appendUrl("http://gsxt.scaic.gov.cn/ztxy.do")
        report_module.appendHeaders(lambda ua: {
            "User-Agent": ua,
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Host": "gsxt.scaic.gov.cn",
            "Referer": "http://gsxt.scaic.gov.cn/ztxy.do",
            "Connection": "keep-alive"
        })
        report_module.appendWebMethod("post")
        # 'random' is a millisecond timestamp computed each time the lambda
        # is called.
        report_module.appendPostData(lambda pripid, qynb_year: {
            'maent.nd': qynb_year.strip(),
            'maent.pripid': pripid,
            'method': 'ndbgDetail',
            'random': str(int(time.time() * 1000))
        })

        registration_xpath = '//div[@id="qufenkuang"]/table[1]/tr[3]/td[1]/text()'
        company_name_xpath = '//div[@id="qufenkuang"]/table[1]/tr[3]/td[2]/text()'
        report_module.appendOutput("nb_zch", registration_xpath,
                                   OutputType.LIST)
        report_module.appendOutput("nb_qym", company_name_xpath,
                                   OutputType.LIST)

        # Rough check that the annual report carries a registration number
        # and a company name; they may be missing when requests are sent too
        # quickly and no data comes back.
        output_retry = Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=5)
        report_module.addEvent(output_retry)
        exception_retry = Event(EventType.EXCEPTION_OCCURED, retry_times=5)
        report_module.addEvent(exception_retry)
        report_module.addSleep(Sleep(2))
        module_super.appendSubModule(report_module, True)
Example #29
0
 def initXzcfxxInfo(self, module_super):
     """Append the administrative-penalty fetch step to ``module_super``.

     The step POSTs ``encrpripid`` (taken from ``com[2]``) to /pub/gsxzcfxx
     through ``self.visitXzcfJson``. An exception names 'home_page' as the
     module to redo.
     """
     penalty_module = Module(self.visitXzcfJson, u"获取行政处罚信息")
     penalty_module.appendUrl("http://218.57.139.24/pub/gsxzcfxx")
     # Referer is derived from com[1]; the CSRF token is csrf[0].
     penalty_module.appendHeaders(
         lambda com, csrf, ua: {
             "Host": "218.57.139.24",
             "User-Agent": ua,
             "Accept": "application/json, text/javascript, */*; q=0.01",
             "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive",
             'Content-Type':
             'application/x-www-form-urlencoded; charset=UTF-8',
             'Referer': 'http://218.57.139.24/pub/' + com[1],
             'X-CSRF-TOKEN': csrf[0],
             'X-Requested-With': 'XMLHttpRequest'
         })
     penalty_module.appendWebMethod("post")
     penalty_module.addSleep(Sleep(2))

     exception_retry = Event(EventType.EXCEPTION_OCCURED,
                             retry_times=5,
                             redo_module='home_page')
     penalty_module.addEvent(exception_retry)
     penalty_module.appendPostData(lambda com: {'encrpripid': com[2]})
     module_super.appendSubModule(penalty_module, True)
Example #30
0
 def initGdxqInfoPrepare(self, module_super):
     """Append the shareholder-detail iteration to ``module_super``.

     An outer Module carrying ``Iterator("recid_list", "rid")`` is appended
     first; a sub-module built around ``self.visitGdxq`` is then attached to
     it. The sleep and the exception event are set on the outer module.
     """
     rid_iterator = Iterator("recid_list", "rid")
     detail_loop = Module(None, u"进入股东详情", rid_iterator)
     module_super.appendSubModule(detail_loop, True)

     def build_detail_url(rid, com):
         # URL path is com[2] followed by the stripped rid.
         return 'http://218.57.139.24/pub/gsnzczxxdetail/%s/%s' % (
             com[2], rid.strip())

     detail_module = Module(self.visitGdxq, u"获取股东详情")
     detail_module.appendUrl(build_detail_url)
     detail_module.appendHeaders(
         lambda ua, com: {
             "Host": "218.57.139.24",
             "User-Agent": ua,
             "Accept":
             "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
             "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive",
             'Referer': 'http://218.57.139.24/pub/' + com[1],
         })

     detail_loop.addSleep(Sleep(2))
     exception_retry = Event(EventType.EXCEPTION_OCCURED,
                             retry_times=5,
                             redo_module='home_page')
     detail_loop.addEvent(exception_retry)
     detail_loop.appendSubModule(detail_module, True)