Exemple #1
0
    def initConfigValidateCode(self):
        """
        Build and register the "init_validate_code" module.

        The module is driven by ``self.visitValidateCode`` and targets the
        validateCode.jspx endpoint with a random ``id`` query value. An
        ASSERT_FAILED event re-runs it (up to 100 times) whenever the assert
        callback receives an empty ``yzm``.
        :return:
        """
        request_headers = {
            "Host":
            "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept":
            "image/png,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language":
            "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding":
            "gzip, deflate",
            "Referer":
            "http://gsxt.hljaic.gov.cn/search.jspx"
        }

        def checkValidatecode(yzm):
            # A non-empty captcha value passes the assertion.
            if yzm:
                return True
            self.holder.logging.warning(u"获取验证码失败")
            return False

        captcha_module = Module(self.visitValidateCode, u"获取验证码")
        captcha_module.module_id = "init_validate_code"

        captcha_url = ("http://gsxt.hljaic.gov.cn/validateCode.jspx?type=0&id=" +
                       str(random.random()))
        captcha_module.appendUrl(captcha_url)
        captcha_module.appendHeaders(request_headers)
        captcha_module.addSleep(Sleep(3))
        captcha_module.appendEncoding("utf-8")

        retry_on_empty = Event(EventType.ASSERT_FAILED,
                               retry_times=100,
                               assert_function=checkValidatecode)
        captcha_module.addEvent(retry_on_empty)

        self.module_manager.appendSubModule(captcha_module, True)
Exemple #2
0
    def initToken(self):
        """
        Build and register the "module_token" module.

        The page at popup_captcha is fetched through ``self.getWebHtml`` and a
        ``token`` output is computed from its HTML by ``getToken``. Both
        EXCEPTION_OCCURED and OUTPUT_NOT_SATISFIED events retry up to 100
        times.
        :return:
        """
        module = Module(self.getWebHtml, u"令牌获取")
        module.module_id = "module_token"
        module.appendUrl('https://www.sgs.gov.cn/notice/search/popup_captcha')
        module.appendHeaders({
            'Host': 'www.sgs.gov.cn',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://www.sgs.gov.cn/notice/home'
        })
        module.appendEncoding("utf-8")

        def getToken(html):
            """
            Extract the value of "session.token" from the page HTML.

            :param html: page source; may be empty or None
            :return: the token string, or None when it cannot be located
            """
            # Search once and test the match object. The previous substring
            # pre-check did not guarantee a regex match (e.g. a value without
            # a closing quote), so calling .group(1) could raise
            # AttributeError on None.
            match = None
            if html:
                match = re.search(r'\"session\.token\": \"(.*?)\"', html)
            if match is None:
                self.holder.logging.error(u'获取session.token失败!')
                return None
            token = match.group(1)
            if not token:
                self.holder.logging.error(u'提取token失败!')
            self.holder.logging.info('token: %s' % token)
            return token

        module.appendOutput(name="token",
                            type=OutputType.FUNCTION,
                            function=getToken)
        module.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))
        module.addEvent(Event(EventType.OUTPUT_NOT_SATISFIED, retry_times=100))
        self.module_manager.appendSubModule(module)
Exemple #3
0
    def initConfigValidateCode(self):
        """
        Build and register the "init_validate_code" module.

        The module is driven by ``self.visitValidateCode`` and targets
        verify.html with a random ``random`` query value. An ASSERT_FAILED
        event re-runs it (up to 100 times) whenever the assert callback
        receives an empty ``yzm``.
        :return:
        """
        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Accept': 'image/webp,*/*;q=0.8',
            'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

        def checkValidatecode(yzm):
            # A non-empty captcha value passes the assertion.
            if yzm:
                return True
            return False

        captcha_module = Module(self.visitValidateCode, u"获取验证码")
        captcha_module.module_id = "init_validate_code"

        captcha_url = ("http://www.nmgs.gov.cn:7001/aiccips/verify.html?random=" +
                       str(random.random()))
        captcha_module.appendUrl(captcha_url)
        captcha_module.appendHeaders(request_headers)
        captcha_module.addSleep(Sleep(3))
        captcha_module.appendEncoding("utf-8")

        retry_on_empty = Event(EventType.ASSERT_FAILED,
                               retry_times=100,
                               assert_function=checkValidatecode)
        captcha_module.addEvent(retry_on_empty)

        self.module_manager.appendSubModule(captcha_module, True)
Exemple #4
0
    def getCmpnySereachList(self):
        """
        Build and register the "get_search_list" module.

        The module POSTs ``code`` (from ``yzm``) and ``textfield`` to
        showInfo.html through ``self.visitSearchList``.
        :output: url_list, name_list, search_list
        :return:
        """
        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

        search_module = Module(self.visitSearchList, u"抓取公司列表")
        search_module.module_id = "get_search_list"

        search_module.appendUrl(
            "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/showInfo.html"
        )
        search_module.appendHeaders(request_headers)
        search_module.appendWebMethod("post")
        search_module.appendEncoding("utf-8")
        # The raw string removes the two-character sequence backslash + "n",
        # not real newline characters.
        search_module.appendPostData(
            lambda yzm, textfield: {
                "code": yzm,
                "textfield": textfield.replace(r"\n", "")
            })
        search_module.addSleep(Sleep(3))

        # Outputs: link hrefs, link texts, then both paired together.
        search_module.appendOutput(
            "url_list", ".//*[@class='list']/ul/li/a/@href", OutputType.LIST)
        search_module.appendOutput(
            "name_list", ".//*[@class='list']/ul/li/a/text()", OutputType.LIST)
        search_module.appendOutput(
            name="search_list",
            type=OutputType.FUNCTION,
            function=lambda url_list, name_list: zip(url_list, name_list))

        # The assertion fails (without retry) when the report is flagged as
        # NON_COMPANY.
        search_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=0,
                  assert_function=lambda: not (
                      self.report.access_type == SeedAccessType.NON_COMPANY)))
        # Exceptions and missing outputs both restart from the captcha module.
        search_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="init_validate_code"))
        search_module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED,
                  retry_times=100,
                  redo_module="init_validate_code"))

        self.module_manager.appendSubModule(search_module)
Exemple #5
0
    def checkValidateCode(self):
        """
        Build and register the "check_validate_code" module.

        The module POSTs ``code`` (from ``yzm``) and ``textfield`` (from
        ``company_key``) to checkCode.html through ``self.getJson``. The assert
        callback inspects the quoted strings of the response body; on failure
        the "init_validate_code" module is redone (up to 100 retries).
        :return:
        """
        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'X-Requested-With': 'XMLHttpRequest',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': 'http://www.nmgs.gov.cn:7001/aiccips/',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

        def checkValidatecode(web=None):
            if not web:
                return False
            # Collect every double-quoted string found in the response body.
            flags = re.compile(r'\"([\s\S]*?)\"').findall(str(web.body))
            # The length test comes first so flags[2] is only read when four
            # strings were found.
            unexpected_shape = len(flags) != 4 or flags[2] != 'textfield'
            if unexpected_shape or (flags[0] == 'flag' and flags[1] != '1'):
                self.holder.logging.warning(u"验证码校验失败!")
                return False
            # Store the decoded fourth string as "textfield" in value_dict.
            self.value_dict["textfield"] = flags[3].decode(
                'raw_unicode_escape')
            return True

        check_module = Module(self.getJson, u"检验验证码")
        check_module.module_id = "check_validate_code"

        check_module.appendUrl(
            "http://www.nmgs.gov.cn:7001/aiccips/CheckEntContext/checkCode.html"
        )
        check_module.appendHeaders(request_headers)
        check_module.appendWebMethod("post")
        check_module.addSleep(Sleep(3))
        check_module.appendEncoding("utf-8")
        check_module.appendPostData(
            lambda yzm, company_key: {"code": yzm, "textfield": company_key})

        check_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=checkValidatecode,
                  redo_module="init_validate_code"))

        self.module_manager.appendSubModule(check_module, True)
Exemple #6
0
    def getCmpnySereachList(self):
        """
        Build and register the "get_search_list" module (company list).

        The module POSTs ``checkNo`` (from ``yzm``) and ``entName`` (from
        ``company_key``) to searchList.jspx through ``self.visitSearchList``.
        :output: url_list, name_list, search_list
        :return:
        """
        request_headers = {
            "Host":
            "218.95.241.36",
            "Proxy-Connection":
            "keep-alive",
            "Cache-Control":
            "max-age=0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Origin":
            "http://218.95.241.36",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Content-Type":
            "application/x-www-form-urlencoded",
            "Referer":
            "http://218.95.241.36/search.jspx",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.8"
        }

        search_module = Module(self.visitSearchList, u"抓取公司列表")
        search_module.module_id = "get_search_list"

        search_module.appendUrl("http://218.95.241.36/searchList.jspx")
        search_module.appendHeaders(request_headers)
        search_module.appendWebMethod("post")
        search_module.appendEncoding("utf-8")
        search_module.appendPostData(
            lambda yzm, company_key: {"checkNo": yzm, "entName": company_key})
        search_module.addSleep(Sleep(3))

        # Outputs: link hrefs, link texts, then both paired together.
        search_module.appendOutput(
            "url_list", ".//*[@class='list']/ul/li/a/@href", OutputType.LIST)
        search_module.appendOutput(
            "name_list", ".//*[@class='list']/ul/li/a/text()", OutputType.LIST)
        search_module.appendOutput(
            name="search_list",
            type=OutputType.FUNCTION,
            function=lambda url_list, name_list: zip(url_list, name_list))

        # The assertion fails (without retry) when the report is flagged as
        # NON_COMPANY.
        search_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=0,
                  assert_function=lambda: not (
                      self.report.access_type == SeedAccessType.NON_COMPANY)))
        # Exceptions and missing outputs both restart from the captcha module.
        search_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="init_validate_code"))
        search_module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED,
                  retry_times=100,
                  redo_module="init_validate_code"))

        self.module_manager.appendSubModule(search_module)
Exemple #7
0
    def getStockholderInfo(self, module_super):
        """
        Build the "get_stockholder_info" module (paged stockholder info) and
        attach it to ``module_super``.

        The URL and the Referer header are computed from ``pno`` and
        ``company_id``. An optional function output collects the ``onclick``
        attributes of the detail links into ``gdxq_list``.
        :param module_super: parent module that receives the new sub-module
        :return:
        """
        page_url_template = "http://218.95.241.36/QueryInvList.jspx?pno=%s&mainId=%s"

        def getGdxqList(html):
            try:
                tree = etree.HTML(html)
                return {
                    "gdxq_list":
                    tree.xpath(".//*[@class='detailsList']/tr/td/a/@onclick")
                }
            except Exception as e:
                # Parsing problems are logged and reported as an empty result.
                self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
                return dict()

        holder_module = Module(self.visitGdxx, u"抓取股东信息")
        holder_module.module_id = "get_stockholder_info"

        holder_module.appendUrl(
            lambda pno, company_id: page_url_template % (pno, company_id))
        holder_module.appendHeaders(lambda company_id: {
            'Host': '218.95.241.36',
            'Proxy-Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'User-Agent':
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer':
            'http://218.95.241.36/businessPublicity.jspx?id=' + str(company_id),
            'Accept-Encoding': 'gzip, deflate',
            "Accept-Language": "zh-CN,zh;q=0.8"
        })
        holder_module.appendEncoding("utf-8")
        holder_module.addSleep(Sleep(3))

        holder_module.appendOutput(type=OutputType.FUNCTION,
                                   function=getGdxqList,
                                   show_up=OutputParameterShowUpType.OPTIONAL)
        holder_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        module_super.appendSubModule(holder_module, True)
Exemple #8
0
    def getAnnalsInfo(self, module_super):
        """
        Iterate over the annual-report list and fetch each report's detail.

        An iterating module over ``annals_list`` (item name ``annals``) is
        attached to ``module_super``; its "get_annals_info" sub-module visits
        each report through ``self.visitQynb``.
        :param module_super: parent module that receives the iterating module
        :return:
        """
        loop_module = Module(None, u"遍历年报列表",
                             Iterator("annals_list", "annals"))
        module_super.appendSubModule(loop_module, True)

        def prepareParams(annals):
            # Only items with at least two elements yield url/name values.
            if annals and len(annals) >= 2:
                return {'nb_url': annals[0], 'nb_name': annals[1]}
            return dict()

        def getURL(nb_url=None):
            # nb_url is appended to the host; without one there is no URL.
            return (u'http://218.95.241.36' + nb_url) if nb_url else None

        request_headers = {
            "Host":
            "218.95.241.36",
            "Proxy-Connection":
            "keep-alive",
            "Cache-Control":
            "max-age=0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer":
            "http://218.95.241.36/searchList.jspx",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.8"
        }

        detail_module = Module(self.visitQynb, u"抓取年报详情")
        detail_module.module_id = "get_annals_info"
        detail_module.appendInput(InputType.FUNCTION,
                                  input_value=prepareParams)
        detail_module.appendUrl(getURL)
        detail_module.appendHeaders(request_headers)
        detail_module.appendEncoding("utf-8")
        detail_module.addSleep(Sleep(3))

        # On exception the sub-module redoes itself, up to 100 times.
        detail_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="get_annals_info"))

        loop_module.appendSubModule(detail_module)
Exemple #9
0
    def getStockholderInfo(self, module_super):
        """
        Build the "get_stockholder_info" module (paged stockholder info) and
        attach it to ``module_super``.

        The URL and the Referer header are computed from ``pno`` and
        ``company_id``. An optional function output collects the ``onclick``
        attributes of the detail links into ``gdxq_list``.
        :param module_super: parent module that receives the new sub-module
        :return:
        """
        page_url_template = "http://gsxt.hljaic.gov.cn/QueryInvList.jspx?pno=%s&mainId=%s"

        def getGdxqList(html):
            try:
                tree = etree.HTML(html)
                return {
                    "gdxq_list":
                    tree.xpath(".//*[@class='detailsList']/tr/td/a/@onclick")
                }
            except Exception as e:
                # Parsing problems are logged and reported as an empty result.
                self.holder.logging.warning(u"获取股东翻页中的股东详情列表失败: %s" % e)
                return dict()

        holder_module = Module(self.visitGdxx, u"抓取股东信息")
        holder_module.module_id = "get_stockholder_info"

        holder_module.appendUrl(
            lambda pno, company_id: page_url_template % (pno, company_id))
        holder_module.appendHeaders(lambda company_id: {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            'Referer':
            'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' +
            str(company_id),
        })
        holder_module.appendEncoding("utf-8")
        holder_module.addSleep(Sleep(3))

        holder_module.appendOutput(type=OutputType.FUNCTION,
                                   function=getGdxqList,
                                   show_up=OutputParameterShowUpType.OPTIONAL)
        holder_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        module_super.appendSubModule(holder_module, True)
Exemple #10
0
    def getCmpnySereachList(self):
        """
        Build and register the "get_search_list" module (company list).

        The module POSTs ``checkNo`` (from ``yzm``) and ``entName`` (from
        ``company_key``) to searchList.jspx through ``self.visitSearchList``.
        :output: url_list, name_list, search_list
        :return:
        """
        request_headers = {
            "Host":
            "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept":
            "text/plain, */*; q=0.01",
            "Accept-Language":
            "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding":
            "gzip, deflate",
            "Content-Type":
            "application/x-www-form-urlencoded",
            "Referer":
            "http://gsxt.hljaic.gov.cn/search.jspx"
        }

        search_module = Module(self.visitSearchList, u"抓取公司列表")
        search_module.module_id = "get_search_list"

        search_module.appendUrl("http://gsxt.hljaic.gov.cn/searchList.jspx")
        search_module.appendHeaders(request_headers)
        search_module.appendWebMethod("post")
        search_module.appendEncoding("utf-8")
        search_module.appendPostData(
            lambda yzm, company_key: {"checkNo": yzm, "entName": company_key})
        search_module.addSleep(Sleep(3))

        # Outputs: link hrefs, link texts, then both paired together.
        search_module.appendOutput(
            "url_list", ".//*[@class='list']/ul/li/a/@href", OutputType.LIST)
        search_module.appendOutput(
            "name_list", ".//*[@class='list']/ul/li/a/text()", OutputType.LIST)
        search_module.appendOutput(
            name="search_list",
            type=OutputType.FUNCTION,
            function=lambda url_list, name_list: zip(url_list, name_list))

        # The assertion fails (without retry) when the report is flagged as
        # NON_COMPANY.
        search_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=0,
                  assert_function=lambda: not (
                      self.report.access_type == SeedAccessType.NON_COMPANY)))
        # Exceptions and missing outputs both restart from the captcha module.
        search_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="init_validate_code"))
        search_module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED,
                  retry_times=100,
                  redo_module="init_validate_code"))

        self.module_manager.appendSubModule(search_module)
Exemple #11
0
    def getAnnalsInfo(self, module_super):
        """
        Iterate over the annual-report list and fetch each report's detail.

        An iterating module over ``annals_list`` (item name ``annals``) is
        attached to ``module_super``; its "get_annals_info" sub-module visits
        each report through ``self.visitQynb``.
        :param module_super: parent module that receives the iterating module
        :return:
        """
        loop_module = Module(None, u"遍历年报列表",
                             Iterator("annals_list", "annals"))
        module_super.appendSubModule(loop_module, True)

        def prepareParams(annals):
            # Only items with at least two elements yield url/name values.
            if annals and len(annals) >= 2:
                return {'nb_url': annals[0], 'nb_name': annals[1]}
            return dict()

        def getURL(nb_url=None):
            # nb_url is appended to the host; without one there is no URL.
            return (u'http://gsxt.hljaic.gov.cn' + nb_url) if nb_url else None

        request_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
        }

        detail_module = Module(self.visitQynb, u"抓取年报详情")
        detail_module.module_id = "get_annals_info"
        detail_module.appendInput(InputType.FUNCTION,
                                  input_value=prepareParams)
        detail_module.appendUrl(getURL)
        detail_module.appendHeaders(request_headers)
        detail_module.appendEncoding("utf-8")
        detail_module.addSleep(Sleep(3))

        # On exception the sub-module redoes itself, up to 100 times.
        detail_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="get_annals_info"))

        loop_module.appendSubModule(detail_module)
Exemple #12
0
    def checkValidateCode(self):
        """
        Build and register the "check_validate_code" module, which verifies
        the captcha value ``yzm``.

        The module POSTs ``checkNo`` to checkCheckNo.jspx through
        ``self.getJson``. A failed assertion or an exception redoes
        "init_validate_code" (up to 100 retries each).
        :return:
        """
        request_headers = {
            "Accept":
            "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.8",
            "Proxy-Connection":
            "keep-alive",
            "Host":
            "218.95.241.36",
            "Origin":
            "http://218.95.241.36",
            "Referer":
            "http://218.95.241.36/search.jspx",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "X-Requested-With":
            "XMLHttpRequest"
        }

        def checkValidatecode(json=None):
            # The check passes only when the response contains the success
            # marker.
            if json and "{success:true}" in json:
                return True
            self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
            return False

        check_module = Module(self.getJson, u"校验验证码")
        check_module.module_id = "check_validate_code"

        check_module.appendUrl('http://218.95.241.36/checkCheckNo.jspx')
        check_module.appendHeaders(request_headers)
        check_module.appendWebMethod("post")
        check_module.appendEncoding("utf-8")
        check_module.appendPostData(lambda yzm: {"checkNo": yzm})

        # Register the two recovery events in the same order as before.
        check_module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=100,
                  assert_function=checkValidatecode,
                  redo_module="init_validate_code"))
        check_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="init_validate_code"))

        self.module_manager.appendSubModule(check_module, True)
Exemple #13
0
    def fetchCmpnyGdxq(self, module_super):
        """
        Iterate over ``gdxq_list`` and fetch every stockholder detail page.

        An iterating module (item name ``gdxq``) is attached to
        ``module_super``; its sub-module visits each item through
        ``self.visitGdxq`` and retries up to 100 times on exception.
        :param module_super: parent module that receives the iterating module
        :return:
        """
        loop_module = Module(
            iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
            name=u"遍历股东详情")
        module_super.appendSubModule(loop_module)

        # TODO: add try/except
        def getURL(gdxq):
            if not gdxq:
                return None
            # Take the text between "('" and "')" of the item as the URL.
            found = re.findall(r"(?<=\(').+?(?='\))", gdxq)
            return found[0] if found else None

        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

        detail_module = Module(self.visitGdxq, u"抓取股东详情")
        detail_module.appendUrl(getURL)
        detail_module.appendHeaders(request_headers)
        detail_module.appendEncoding("utf-8")
        detail_module.addSleep(Sleep(3))

        detail_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=100))

        loop_module.appendSubModule(detail_module)
Exemple #14
0
    def fetchStockholderDetail(self, module_super):
        """
        Iterate over ``gdxq_list`` and fetch every stockholder detail page.

        An iterating module (item name ``gdxq``) is attached to
        ``module_super``; its sub-module visits each item through
        ``self.visitGdxq`` and retries up to 100 times on exception.
        :param module_super: parent module that receives the iterating module
        :return:
        """
        loop_module = Module(
            iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
            name=u"遍历股东详情")
        module_super.appendSubModule(loop_module)

        def getURL(gdxq):
            if not gdxq:
                return None
            # The text between "('" and "')" of the item is appended to the
            # host.
            found = re.findall(r"(?<=\(').+?(?='\))", gdxq)
            if not found:
                return None
            return u"http://gsxt.hljaic.gov.cn" + found[0]

        request_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/searchList.jspx"
        }

        detail_module = Module(self.visitGdxq, u"抓取股东详情")
        detail_module.appendUrl(getURL)
        detail_module.appendHeaders(request_headers)
        detail_module.appendEncoding("utf-8")
        detail_module.addSleep(Sleep(3))

        detail_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=100))

        loop_module.appendSubModule(detail_module)
Exemple #15
0
    def getCompanyRecordInfo(self, module_super):
        """
        Build the "get_record_info" module (company record info) and attach
        it to ``module_super``.

        The module POSTs ``entNo``, ``entType`` and ``regOrg`` (elements 1-3
        of ``params_list``) through ``self.visitBaxx`` and retries up to 100
        times on exception.
        :param module_super: parent module that receives the new sub-module
        :return:
        """
        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Referer':
            "http://www.nmgs.gov.cn:7001/aiccips/GSpublicity/GSpublicityList.html?service=entInfo",
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }

        record_module = Module(self.visitBaxx, u"抓取备案信息")
        record_module.module_id = "get_record_info"

        record_module.appendUrl(
            "http://www.nmgs.gov.cn:7001/aiccips/GSpublicity/GSpublicityList.html?service=entCheckInfo"
        )
        record_module.appendHeaders(request_headers)
        record_module.appendEncoding("utf-8")
        record_module.addSleep(Sleep(3))
        record_module.appendWebMethod("post")
        # Elements 1, 2 and 3 of params_list are sent as strings.
        record_module.appendPostData(lambda params_list: {
            "entNo": str(params_list[1]),
            "entType": str(params_list[2]),
            "regOrg": str(params_list[3])
        })

        record_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=100))

        module_super.appendSubModule(record_module, True)
Exemple #16
0
    def initConfigValidateCode(self):
        """
        Build and register the "init_validate_code" module, which fetches the
        captcha and checks that a captcha result was produced.

        The module is driven by ``self.visitValidateCode``. An ASSERT_FAILED
        event re-runs it (up to 100 times) whenever the assert callback
        receives an empty ``yzm``.
        :return:
        """
        request_headers = {
            "Accept": "image/webp,image/*,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Proxy-Connection": "keep-alive",
            "Host": '218.95.241.36',
            "Referer": "http://218.95.241.36/search.jspx",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
        }

        def checkYZM(yzm=None):
            # A non-empty captcha value passes the assertion.
            if yzm:
                return True
            self.holder.logging.warning(u"获取验证码失败")
            return False

        captcha_module = Module(self.visitValidateCode, u"获取验证码")
        captcha_module.module_id = "init_validate_code"

        captcha_module.appendUrl('http://218.95.241.36/validateCode.jspx?type=0')
        captcha_module.appendHeaders(request_headers)
        captcha_module.addSleep(Sleep(3))
        captcha_module.appendEncoding("utf-8")

        retry_on_empty = Event(EventType.ASSERT_FAILED,
                               retry_times=100,
                               assert_function=checkYZM)
        captcha_module.addEvent(retry_on_empty)

        self.module_manager.appendSubModule(captcha_module, True)
Exemple #17
0
    def fetchPunishInfo(self, module_super):
        """
        Iterate over the administrative-punishment pages.

        An iterating module over ``xzcf_page_range`` (item name ``pno``) is
        attached to ``module_super``; its sub-module visits each page through
        ``self.visitXzcf`` and retries up to 50 times on exception.
        :param module_super: parent module that receives the iterating module
        :return:
        """
        loop_module = Module(
            iterator=Iterator(seeds="xzcf_page_range", param_name="pno"),
            name=u"遍历行政处罚翻页")
        module_super.appendSubModule(loop_module)

        page_url_template = "http://218.95.241.36/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s"

        page_module = Module(self.visitXzcf, u"抓取行政处罚信息")
        # A fresh random value is put into "ran" each time the URL is built.
        page_module.appendUrl(
            lambda pno, company_id: page_url_template %
            (pno, company_id, str(random.random())))
        page_module.appendHeaders(lambda company_id: {
            'Host': '218.95.241.36',
            'Proxy-Connection': 'keep-alive',
            'Accept': '*/*',
            'Referer':
            'http://218.95.241.36/businessPublicity.jspx?id=' + str(company_id),
            'User-Agent':
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0",
            'Accept-Encoding': 'gzip, deflate, sdch',
            "Accept-Language": "zh-CN,zh;q=0.8"
        })
        page_module.appendEncoding("utf-8")
        page_module.addSleep(Sleep(3))

        page_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        loop_module.appendSubModule(page_module)
Exemple #18
0
    def fetchStockholderDetail(self, module_super):
        """
        Iterate over ``gdxq_list`` and fetch every stockholder detail page.

        An iterating module (item name ``gdxq``) is attached to
        ``module_super``; its sub-module visits each item through
        ``self.visitGdxq`` and retries up to 50 times on exception.
        :param module_super: parent module that receives the iterating module
        :return:
        """
        loop_module = Module(
            iterator=Iterator(seeds="gdxq_list", param_name="gdxq"),
            name=u"遍历股东详情")
        module_super.appendSubModule(loop_module)

        def getURL(gdxq):
            if not gdxq:
                return None
            # The text between "('" and "')" of the item is appended to the
            # host.
            found = re.findall(r"(?<=\(').+?(?='\))", gdxq)
            if not found:
                return None
            return "http://218.95.241.36" + found[0]

        request_headers = {
            "Host":
            "218.95.241.36",
            "Proxy-Connection":
            "keep-alive",
            "Cache-Control":
            "max-age=0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer":
            "http://218.95.241.36/searchList.jspx",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.8"
        }

        detail_module = Module(self.visitGdxq, u"抓取股东详情")
        detail_module.appendUrl(getURL)
        detail_module.appendHeaders(request_headers)
        detail_module.appendEncoding("utf-8")
        detail_module.addSleep(Sleep(3))

        detail_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        loop_module.appendSubModule(detail_module)
Exemple #19
0
 def initConfigBaseInfo(self, module_super):
     """
     Build the basic-info module and attach it to ``module_super``.

     The module is driven by ``self.visitJbxx`` with "company_url" as its URL
     entry, and declares an optional list output ``gdxq_list`` taken from the
     link hrefs inside the element with id "investorTable".
     :param module_super: parent module that receives the new sub-module
     :return:
     """
     request_headers = {
         'Host': 'www.sgs.gov.cn',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
         'Accept-Encoding': 'gzip, deflate',
         'Referer': 'https://www.sgs.gov.cn/notice/search/ent_info_list'
     }

     base_module = Module(self.visitJbxx, u"基本信息")
     base_module.appendUrl("company_url")
     base_module.appendHeaders(request_headers)
     base_module.appendEncoding("utf-8")
     base_module.appendOutput(
         name="gdxq_list",
         xpath=".//*[@id='investorTable']//td/a/@href",
         type=OutputType.LIST,
         show_up=OutputParameterShowUpType.OPTIONAL)
     module_super.appendSubModule(base_module, True)
Exemple #20
0
    def fetchPunishInfo(self, module_super):
        """
        Iterate over the administrative-punishment pages.

        An iterating module over ``xzcf_page_range`` (item name ``pno``) is
        attached to ``module_super``; its sub-module visits each page through
        ``self.visitXzcf`` and retries up to 50 times on exception.
        :param module_super: parent module that receives the iterating module
        :return:
        """
        loop_module = Module(
            iterator=Iterator(seeds="xzcf_page_range", param_name="pno"),
            name=u"遍历行政处罚翻页")
        module_super.appendSubModule(loop_module)

        page_url_template = "http://gsxt.hljaic.gov.cn/QueryPunList.jspx?pno=%s&mainId=%s&ran=%s"

        page_module = Module(self.visitXzcf, u"抓取行政处罚信息")
        # A fresh random value is put into "ran" each time the URL is built.
        page_module.appendUrl(
            lambda pno, company_id: page_url_template %
            (pno, company_id, str(random.random())))
        page_module.appendHeaders(lambda company_id: {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            'Referer':
            'http://gsxt.hljaic.gov.cn/businessPublicity.jspx?id=' +
            str(company_id),
        })
        page_module.appendEncoding("utf-8")
        page_module.addSleep(Sleep(3))

        page_module.addEvent(
            Event(EventType.EXCEPTION_OCCURED, retry_times=50))

        loop_module.appendSubModule(page_module)
Exemple #21
0
    def checkValidateCode(self):
        """
        Register the module that submits the captcha text for verification.

        The module is configured as a POST to ``checkCheckNo.jspx`` with the
        ``yzm`` value sent as ``checkNo``. Both the failed-assertion event and
        the exception event name ``init_validate_code`` as their redo module,
        each with ``retry_times=100``.
        """
        verifier = Module(self.getJson, u"校验验证码")
        verifier.module_id = "check_validate_code"

        request_headers = {
            "Host": "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept": "image/png,image/*;q=0.8,*/*;q=0.5",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Referer": "http://gsxt.hljaic.gov.cn/search.jspx"
        }

        def buildPostData(yzm):
            return {"checkNo": yzm}

        verifier.appendUrl('http://gsxt.hljaic.gov.cn/checkCheckNo.jspx')
        verifier.appendHeaders(request_headers)
        verifier.appendWebMethod("post")
        verifier.appendEncoding("utf-8")
        verifier.appendPostData(buildPostData)

        def checkValidatecode(json=None):
            # Accept only a non-empty response that contains the success marker.
            if json and "{success:true}" in json:
                return True
            self.holder.logging.warning(u"验证码校验失败, 需要重新获取验证码")
            return False

        on_assert_failed = Event(EventType.ASSERT_FAILED,
                                 retry_times=100,
                                 assert_function=checkValidatecode,
                                 redo_module="init_validate_code")
        on_exception = Event(EventType.EXCEPTION_OCCURED,
                             retry_times=100,
                             redo_module="init_validate_code")
        verifier.addEvent(on_assert_failed)
        verifier.addEvent(on_exception)

        self.module_manager.appendSubModule(verifier, True)
Exemple #22
0
    def getAnnalsList(self, module_super):
        """
        Register the module that fetches the annual-report list page.

        The module exposes an optional ``annals_list`` output: a list of
        ``[url, name]`` pairs taken from the anchors under the element with
        id ``qiyenianbao``.

        :param module_super: parent module the new module is appended to
        :return: None
        """
        annals = Module(self.getWebHtml, u"抓取年报列表")
        annals.module_id = "get_annals_list"

        def prepareParams(company_url):
            # url_id is the second "="-separated piece of the company URL.
            if not company_url:
                return {}
            return {"url_id": company_url.split("=")[1]}

        annals.appendInput(InputType.FUNCTION, prepareParams)

        def getURL(url_id=None):
            if not url_id:
                return None
            return u'http://218.95.241.36/enterprisePublicity.jspx?id=' + url_id

        annals.appendUrl(getURL)

        request_headers = {
            "Host": "218.95.241.36",
            "Proxy-Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer": "http://218.95.241.36/searchList.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }
        annals.appendHeaders(request_headers)
        annals.appendEncoding("utf-8")
        annals.addSleep(Sleep(3))

        def parseAnnalsList(html=None):
            # Any failure while parsing yields an empty list and a warning.
            try:
                anchors = etree.HTML(html).xpath(
                    ".//*[@id='qiyenianbao']/table/tr/td/a")
                pairs = ([''.join(anchor.xpath('@href')).strip(),
                          ''.join(anchor.xpath('text()')).replace(
                              u'年度报告', '')] for anchor in anchors)
                # Anchors whose remaining text is the "details" label are skipped.
                return [pair for pair in pairs if pair[1] != u'详情']
            except Exception as e:
                self.holder.logging.warning(u"获取annals_list失败: %s" % e)
                return []

        annals.appendOutput(name='annals_list',
                            type=OutputType.FUNCTION,
                            function=parseAnnalsList,
                            show_up=OutputParameterShowUpType.OPTIONAL)
        annals.addEvent(Event(EventType.EXCEPTION_OCCURED, retry_times=100))

        module_super.appendSubModule(annals, True)
Exemple #23
0
    def getAnnalsList(self, module_super):
        """
        Register the module that fetches the annual-report list page.

        The module exposes an optional ``annals_list`` output: a list of
        ``[url, name]`` pairs taken from the anchors under the element with
        id ``qiyenianbao``. On an exception event the module names itself
        (``get_annals_list``) as the redo module.

        :param module_super: parent module the new module is appended to
        :return: None
        """
        module = Module(self.getWebHtml, u"抓取年报列表")
        module.module_id = "get_annals_list"

        def prepareParams(company_url):
            """Derive ``url_id`` from the second "="-separated piece of the URL."""
            query_dict = {}
            if company_url:
                query_dict["url_id"] = company_url.split("=")[1]
            return query_dict

        module.appendInput(InputType.FUNCTION, prepareParams)

        def getURL(url_id=None):
            """Return the annual-report page URL, or None when no id is given."""
            if url_id:
                return u'http://gsxt.hljaic.gov.cn/enterprisePublicity.jspx?id=' + url_id
            return None

        module.appendUrl(getURL)

        module.appendHeaders({
            "Host":
            "gsxt.hljaic.gov.cn",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; rv:37.0) Gecko/20100101 Firefox/37.0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language":
            "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding":
            "gzip, deflate",
            "Referer":
            "http://gsxt.hljaic.gov.cn/searchList.jspx"
        })
        module.appendEncoding("utf-8")
        module.addSleep(Sleep(3))

        def getAnnalsList(html=None):
            """Extract ``[url, name]`` pairs; return [] if parsing fails."""
            qynb_list = []

            try:
                tree = etree.HTML(html)
                _list = tree.xpath(".//*[@id='qiyenianbao']/table/tr/td/a")

                for ll in _list:
                    url = ''.join(ll.xpath('@href')).strip()
                    name = ''.join(ll.xpath('text()')).replace(u'年度报告', '')
                    # Anchors whose remaining text is the "details" label are skipped.
                    if name != u'详情':
                        qynb_list.append([url, name])
            except Exception as e:
                # Catch Exception rather than everything, so KeyboardInterrupt
                # and SystemExit still propagate, and log the failure instead
                # of discarding it silently.
                self.holder.logging.warning(u"获取annals_list失败: %s" % e)
                qynb_list = []
            return qynb_list

        module.appendOutput(name='annals_list',
                            type=OutputType.FUNCTION,
                            function=getAnnalsList,
                            show_up=OutputParameterShowUpType.OPTIONAL)
        module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="get_annals_list"))

        module_super.appendSubModule(module, True)
Exemple #24
0
    def getCompanyInfo(self, module_super):
        """
        Register the module that fetches the company detail page.

        Two optional list outputs are declared (``gdxq_list`` and
        ``params_list``). After the module is appended to ``module_super``,
        two global-range bypasses are attached for ``get_record_info`` and
        ``get_punish_info``; their condition function returns True when
        ``params_list`` is empty or missing.

        :param module_super: parent module the new module is appended to
        """
        info = Module(self.visitJbxx, u"抓取公司信息")
        info.module_id = "get_cmpny_info"

        def getURL(company_url):
            # Values containing "http" are used as-is; others get the site prefix.
            if "http" not in company_url:
                relative = company_url.replace('..', '')
                return u"http://www.nmgs.gov.cn:7001/aiccips" + relative
            return company_url

        info.appendUrl(getURL)

        request_headers = {
            'Host': 'www.nmgs.gov.cn:7001',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Origin': 'http://www.nmgs.gov.cn:7001',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh-TW;q=0.4'
        }
        info.appendHeaders(request_headers)
        info.appendEncoding("utf-8")
        info.addSleep(Sleep(3))

        optional = OutputParameterShowUpType.OPTIONAL
        info.appendOutput(name="gdxq_list",
                          xpath=".//*[@id='invInfo']/table/tr/td/a/@onclick",
                          type=OutputType.LIST,
                          show_up=optional)
        info.appendOutput(name="params_list",
                          xpath='.//input[@type=\'hidden\']/@value',
                          type=OutputType.LIST,
                          show_up=optional)

        for event_type in (EventType.EXCEPTION_OCCURED,
                           EventType.OUTPUT_NOT_SATISFIED):
            info.addEvent(Event(event_type, retry_times=100))

        module_super.appendSubModule(info, True)

        def checkParmsList(params_list=None):
            # True exactly when no hidden-input values were collected.
            return not params_list

        for target_id in ("get_record_info", "get_punish_info"):
            info.appendBypass(
                Bypass(condition_fuc=checkParmsList,
                       module_id=target_id,
                       range_global=True))
Exemple #25
0
    def getCompanyInfo(self, module_super):
        """
        Fetch the company detail page and derive the page ranges of its sections.

        Outputs registered, in order: ``gdxq_list`` (optional list), the dict
        returned by ``getCompanyID`` (carrying ``company_id`` on success), and
        the optional ``gdxx_page_range``, ``bgxx_page_range``,
        ``baxx_page_range``, ``fzjg_page_range`` and ``xzcf_page_range``
        function outputs.

        :param module_super: parent module the new module is appended to
        """
        module = Module(self.visitJbxx, u"抓取公司信息")
        module.module_id = "get_cmpny_info"

        def getURL(company_url=None):
            """Return the URL as-is if it contains "http", else prefix the host."""
            # NOTE(review): a None company_url raises TypeError on the "in"
            # test below; left unchanged because the exception event may rely
            # on it - confirm against the framework.
            if "http" in company_url:
                return company_url
            else:
                return u'http://218.95.241.36' + company_url

        module.appendUrl(getURL)

        module.appendHeaders({
            "Host": "218.95.241.36",
            "Proxy-Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36",
            "Referer": "http://218.95.241.36/searchList.jspx",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8"
        })
        module.appendEncoding("utf-8")
        module.addSleep(Sleep(3))

        def getCompanyID(company_url):
            """Return ``{"company_id": ...}`` parsed from ``?id=``, or {} on failure."""
            query_dict = dict()

            try:
                query_dict["company_id"] = re.search(
                    r'\?id\=([\s\S]*)', company_url).group().split("=")[1]
            except Exception as e:
                self.holder.logging.warning(u"获取company_id失败: %s" % e)
                query_dict = dict()

            return query_dict

        module.appendOutput(name="gdxq_list",
                            xpath=".//*[@id='invDiv']/table/tr/td/a/@onclick",
                            type=OutputType.LIST,
                            show_up=OutputParameterShowUpType.OPTIONAL)
        module.appendOutput(type=OutputType.FUNCTION, function=getCompanyID)

        def makePageRangeGetter(fenye_xpath, info_prefix, fail_message):
            """Build the output function that computes one section's page range.

            The returned function takes ``html``, selects the pagination table
            with ``fenye_xpath``, asks ``self.parse_total_pg`` for the page
            count and returns ``range(2, count + 1)``. It returns ``[]`` when
            no table matches, when the count is <= 1, or when any exception
            occurs (logged with ``fail_message``).
            """

            def getPageRange(html):
                try:
                    fenye_table = etree.HTML(html).xpath(fenye_xpath)
                    if not fenye_table:
                        return []
                    pageno = self.parse_total_pg(fenye_table[0])
                    if pageno <= 1:
                        return []
                    self.holder.logging.info(info_prefix + str(pageno) +
                                             u"---------------------")
                    return range(2, int(pageno) + 1)
                except Exception as e:
                    self.holder.logging.warning(fail_message % e)
                    return []

            return getPageRange

        # (output name, pagination-table xpath, info-log prefix, failure-log
        # format) for each paginated section, in registration order.
        page_range_outputs = (
            # Shareholder information
            ("gdxx_page_range",
             ".//div[@id='invDiv']/following-sibling::table[1]|.//div[@id='invPagination']/table[1]",
             u"------------------股东信息页码: ",
             u"获取股东信息页码失败: %s"),
            # Change records
            ("bgxx_page_range",
             ".//div[@id='altDiv']/following-sibling::table[1]|.//div[@id='altPagination']/table[1]",
             u"------------------变更信息页码: ",
             u"获取变更信息页码失败: %s"),
            # Filing records
            ("baxx_page_range",
             ".//div[@id='memDiv']/following-sibling::table[1]|.//*[@id='beian']/table[2]",
             u"------------------备案信息页码: ",
             u"获取备案信息页码失败: %s"),
            # Branch offices
            ("fzjg_page_range",
             ".//div[@id='childPagination']/table[1]|.//div[@id='childDiv']/following-sibling::table[1]",
             u"------------------分支机构信息页码:",
             u"获取分支机构页码失败: %s"),
            # Administrative penalties
            ("xzcf_page_range",
             ".//*[@id='xingzhengchufa']/table[2]",
             u"------------------行政处罚信息页码:",
             u"获取行政处罚页码失败: %s"),
        )
        for output_name, fenye_xpath, info_prefix, fail_message in page_range_outputs:
            module.appendOutput(name=output_name,
                                type=OutputType.FUNCTION,
                                function=makePageRangeGetter(
                                    fenye_xpath, info_prefix, fail_message),
                                show_up=OutputParameterShowUpType.OPTIONAL)

        def checkCompnayID(company_id=None):
            """Assertion: a non-empty company_id must have been produced."""
            if not company_id:
                self.holder.logging.warning(u"company_id无效")
                return False
            return True

        module.addEvent(
            Event(EventType.ASSERT_FAILED,
                  retry_times=10,
                  assert_function=checkCompnayID))
        module.addEvent(
            Event(EventType.EXCEPTION_OCCURED,
                  retry_times=100,
                  redo_module="get_cmpny_info"))
        module.addEvent(
            Event(EventType.OUTPUT_NOT_SATISFIED,
                  retry_times=100,
                  redo_module="get_cmpny_info"))

        module_super.appendSubModule(module, True)