Example #1
0
    def __init__(self):
        """Set up the spider: proxies, AES helper, output files and request identities.

        ``is_debug`` is hard-coded to False below, so the proxy file is read and
        ``Spider.__init__`` receives ``len(self.proxies_dict)``; the debug
        branch would pass ``1`` instead.
        """
        self._can_use_proxy_num = 0
        self.is_debug = False
        if self.is_debug:
            Spider.__init__(self, 1)
        else:
            # presumably read_proxy fills self.proxies_dict from the file -- confirm
            self.proxies_dict = []
            self.read_proxy("proxy_032512.txt")
            Spider.__init__(self, len(self.proxies_dict))

        self._aes_ = CCIQ_AES()
        # Successful queries
        self.query_success = FileSaver("c_query_detail.txt")
        # Failed queries
        self.query_failure = FileSaver("c_query_detail_failure.txt")
        # Companies that have already been crawled
        self.already_cname_list = FileSaver("c_already_detail.txt")
        # Initialise the already-crawled companies
        self.init_cname()

        #self.extJson = self._aes_.encrypt(spider.util.utf8str({"cl_screenSize": "640x960", "cl_cookieId": "B200BA9D-A3A0-4140-A293-9A1A671BA5CE", "Org_iOS_Version": "2.0.1"}))
        # self.extJson = "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="
        # self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
        self.bloom = set()

        # Pre-computed extJson values; the first entry equals the commented-out
        # self.extJson literal above. How they are selected is not visible here.
        self.extJsons = ["Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="]

        # User-Agent strings; presumably paired by index with extJsons -- confirm at the use site
        self.user_agents = ["=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"]
        self.is_first = True
        self.init_time = 0
Example #2
0
    def __init__(self):
        """Set up the spider: proxies, AES helper, output files, extJson and User-Agent.

        The proxy file is read first and ``Spider.__init__`` receives
        ``len(self.proxies_dict)``.
        """
        # presumably read_proxy fills self.proxies_dict from the file -- confirm
        self.proxies_dict = []
        self.read_proxy("proxy_20160218.txt")
        Spider.__init__(self, len(self.proxies_dict))

        self.num_count = 0
        #self.filter_name = []
        self._aes_ = CCIQ_AES()
        # Full information of the company list found by company name
        self.query_company_info = FileSaver("query_company_info.txt")
        # Partial information of the company list found by company name
        self.query_company_info_part = FileSaver("query_company_info_part.txt")
        # Company-name searches that failed
        self.query_company_info_failure = FileSaver(
            "query_company_info_failure.txt")
        # Company names that have already been crawled
        self.already_cname = FileSaver("already_cname.txt")
        # Initialise the already-crawled companies
        self.init_cname()
        # Company names whose detail query failed
        self.detail_failure = FileSaver("detail_failure1.txt")
        # All company information the app can obtain, including shareholder information
        self.detail_company = FileSaver("detail_company.txt")
        # extJson is the AES-encrypted, utf8str-serialised client descriptor below
        self.extJson = self._aes_.encrypt(
            spider.util.utf8str({
                "cl_screenSize": "640x960",
                "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
                "Org_iOS_Version": "2.0.1"
            }))
        self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
Example #3
0
    def __init__(self):
        """Set up the spider: proxies, AES helper, output files and request identities.

        ``is_debug`` is hard-coded to False below, so the proxy file is read and
        ``Spider.__init__`` receives ``len(self.proxies_dict)``; the debug
        branch would pass ``1`` instead.
        """
        self.is_debug = False
        self._can_use_proxy_num = 0
        if self.is_debug:
            Spider.__init__(self, 1)
        else:
            # presumably read_proxy fills self.proxies_dict from the file -- confirm
            self.proxies_dict = []
            self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
            Spider.__init__(self, len(self.proxies_dict))
        self.error_cnt = 0
        self._aes_ = CCIQ_AES()
        # Full information of the company list found by company name
        self.query_company_list = FileSaver("all_company_list.txt")

        # Company names that have already been crawled
        self.already_cname_list = FileSaver("all_company_list_already.txt")

        # Error types already encountered while crawling
        self.already_error_type = FileSaver("all_already_error_type.txt")

        # Initialise the already-crawled companies
        self.init_cname()
        # Pre-computed extJson values; how they are selected is not visible here
        self.extJsons = [
            "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
        ]

        # User-Agent strings; presumably paired by index with extJsons -- confirm at the use site
        self.user_agents = [
            "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"
        ]

        self.bloom = set()
Example #4
0
    def get_detail(self, cname, code, area):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "bl_oc_code": code,  #"71526726X"
            "v1": "QZOrgV004",
            "isDirect": "1",
            "bl_oc_name": cname,  #"腾讯科技"
            "bl_oc_area": area  #"4403"
        }

        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'res is none -- encryptedJson -->', str(encryptedJson)
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 404:
            print "404 ------ ", code
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print res.code, '------', code
            time.sleep(0.5)
            self.get_detail(cname, code, area)
            return
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------code ', code, ' res.text is null----------------------------'
                self.detail_failure.append(cname + "|" + str(code) + "|" +
                                           str(area))
                return
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            detail = eval(result)
            listGD = self.get_gd(area, code)
            if listGD is not None:
                detail['listGD'] = listGD['listGD']
            print 'detail=================================', spider.util.utf8str(
                detail)
            self.detail_company.append(spider.util.utf8str(detail))
            return
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)
Example #5
0
    def get_branch(self,cname, now_page=1, list_branch=[], retry=0):
        """
        查询分支机构
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page"
        encryptedJson = {
            "companyName" : cname,
            "v1" : "QZOrgV005",
            "page" : now_page,
            "pagesize" : "10"
        }

        res = self.req_all(url, encryptedJson)
        if res is None:
            return None
        if res.code == 200:
            try:
                c = eval(res.text)['c']
                if len(c) == 0:
                    print "get_branch --- cname=%s --- retry=%d --- reason:len(c)=0" % (cname, retry)
                    return None
                result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
                temp = eval(result)
                if temp is not None:
                    for t in temp['Branch']:
                        list_branch.append(t)
                    if len(temp['Branch']) == 10:
                        if now_page > 3:
                            return list_branch
                        now_page += 1
                        print cname, "翻页 -----------------------------------> now_page", now_page
                        return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry)
                    else:
                        return list_branch
                else:
                    print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- Branch is NULL" % (cname, retry, now_page)
                    return None
            except Exception as err:
                print "get_branch --- cname=%s --- retry=%d --- reason:%s" % (cname, retry, err)
                if retry < 5:
                    retry += 1
                    time.sleep(retry*1.5)
                    return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry)
                else:
                    return None
        else:
            print "get_branch --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res.code)
            if retry < 5:
                retry += 1
                time.sleep(retry*1.5)
                return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry)
            else:
                return None
Example #6
0
    def get_branch(self, cname, now_page, list_branch):
        """
        查询分支机构
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page"
        encryptedJson = {
            "companyName": cname,
            "v1": "QZOrgV004",
            "page": now_page,
            "pagesize": "10"
        }
        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJson
        })
        res = self.request_url(url,
                               headers=self.headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'get_branch ------ res is none ---->', cname, now_page
            return None
        elif res.code == 404:
            print "get_branch ------ 404 --- ", cname, now_page
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print 'get_branch ------ ', res.code, cname, now_page
            time.sleep(0.5)
            return self.get_branch(cname, now_page)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print 'get_branch------res.text is null----------------------------', cname, now_page
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            temp = eval(result)
            if temp is not None:
                for t in temp['Branch']:
                    list_branch['Branch'].append(t)
                if len(temp['Branch']) == 10:
                    now_page += 1
                    return self.get_branch(cname, now_page, list_branch)
                else:
                    return list_branch
            else:
                print 'get_branch------Branch is null----------------------------', cname, now_page
                return None
        else:
            print cname, "######## get_branch  ################   UNKNOWN ERROR   ######################", res.code
        return None
Example #7
0
 def __init__(self):
     """Set up the spider with a fixed argument of 1, an AES helper and three output files.

     The proxy-based initialisation is commented out; ``Spider.__init__`` is
     called with ``1`` instead.
     """
     #self.proxies_dict = []
     #self.read_proxy("../spider/proxy/proxy.txt")
     #Spider.__init__(self, len(self.proxies_dict))
     Spider.__init__(self, 1)
     self.num_count = 0
     self._aes_ = CCIQ_AES()
     # Full company information the app can obtain
     self.save_success = FileSaver("exist_company.txt")
     # Partial company information the app can obtain
     self.part_success = FileSaver("part_company.txt")
     # Company names whose query failed
     self.fail_name = FileSaver("fail_name.txt")
Example #8
0
    def __init__(self):
        """Set up the spider with a fixed argument of 20, an AES helper, an output file and request identities."""
        Spider.__init__(self, 20)
        self._aes_ = CCIQ_AES()

        #self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)")
        self.proxy_filter = FileSaver("proxy_filter_030309_detail1.txt")


        # Pre-computed extJson values.  Unlike plain base64 text, each entry
        # keeps its surrounding double quotes and "\/" sequences inside the
        # Python string -- presumably so it can be spliced into a JSON body
        # verbatim (confirm at the use site).
        self.extJsons = ['"Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr\/uapICH92P\/Crryt63u28aP4QP665AzcT\/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="',
                         '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a\/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4="',
                         '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49\/aDwt3NZNp4TGa5iBFpYLm69F\/6PPFoXIR\/Aw5p48\/\/8OgZFpddDUwQ="']

        # User-Agent strings; presumably paired by index with extJsons -- confirm at the use site
        self.user_agents = ["=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"]
Example #9
0
    def get_gd(self, area, code):
        """
        获取股东信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"

        encryptedJson = {
            "bl_oc_area": area,  #4107
            "v1": "QZOrgV004",
            "bl_oc_code": code  #672867774
        }

        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJson
        })
        res = self.request_url(url,
                               headers=self.headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'get_gd ------ res is none -- get_gd code is -->', code
            return None
        elif res.code == 404:
            print "get_gd ------ 404 ------ ", code
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print 'get_gd ------ ', res.code, code
            time.sleep(0.5)
            return self.get_gd(area, code)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print 'get_gd ------', code, ' res.text is null----------------------------'
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            list_gd = eval(result)
            return list_gd
        else:
            print code, "#######################################UNKNOWN ERROR#############################################", res.code
        return None
Example #10
0
 def get_gd(self, code, retry=0):
     """
     获取股东信息
     """
     url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"
     encryptedJson = {
         "bl_oc_area": "",
         "v1": "QZOrgV005",
         "bl_oc_code": code
     }
     res = self.req_all(url, encryptedJson)
     if res is None:
         return None
     if res.code == 200:
         try:
             c = eval(res.text)['c']
             if len(c) == 0:
                 print "get_gd --- retry=%d --- reason:len(c)=0" % retry
                 return None
             result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(
                 c)
             #print "获取股东信息结果:", spider.util.utf8str(result)
             return eval(result)
         except Exception as err:
             print "get_gd --- retry=%d --- reason:%s" % (retry, err)
             if retry < 5:
                 retry += 1
                 time.sleep(retry * 1.5)
                 return self.get_gd(code, retry=retry)
             else:
                 return None
     else:
         print "get_gd --- retry=%d --- res.code=%d" % (retry, res.code)
         if retry < 5:
             retry += 1
             time.sleep(retry * 1.5)
             return self.get_gd(code, retry=retry)
         else:
             return None
Example #11
0
    def get_inversted(self, url, encryptedJson):
        """
        通用请求方法
        """

        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJson
        })

        res = self.request_url(url,
                               headers={"Content-Type": "application/json"},
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'res is none -- search gd code is -->', code
            return None
        elif res.code == 404:
            print "404 ------ ", code
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print res.code, '------', code
            time.sleep(0.5)
            return self.get_gd(area, code)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------gd code', code, ' res.text is null----------------------------' % cname
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            list_gd = eval(result)
            #print 'gd infos =======================',spider.util.utf8str(list_gd)
            return list_gd
        else:
            print code, "#######################################UNKNOWN ERROR#############################################", res.code
        return None
Example #12
0
    def get_inversted(self, cname):
        """
        查询投资信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {"input": cname, "v1": "QZOrgV004"}
        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'get_inversted ------ res is none --', cname
            return None
        elif res.code == 404:
            print "get_inversted ------ 404 --- ", cname
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print 'get_inversted ------ ', res.code, cname
            time.sleep(0.5)
            return self.get_inversted(cname)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print 'get_inversted ------ ', cname, ' res.text is null----------------------------'
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            list_inversted = eval(result)
            return list_inversted
        else:
            print cname, "##############  get_inversted  ############   UNKNOWN ERROR   #################", res.code
        return None
Example #13
0
    def get_inversted(self, cname, retry=0):
        """
        查询投资信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment"
        encryptedJson = {"input": cname, "v1": "QZOrgV005"}

        res = self.req_all(url, encryptedJson)
        if res is None:
            return None
        if res.code == 200:
            try:
                c = eval(res.text)['c']
                if len(c) == 0:
                    print "get_inversted --- cname=%s --- retry=%d --- reason:len(c)=0" % (
                        cname, retry)
                    return None
                result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(
                    c)
                return eval(result)
            except Exception as err:
                print "get_inversted --- cname=%s --- retry=%d --- reason:%s" % (
                    cname, retry, err)
                if retry < 5:
                    retry += 1
                    time.sleep(retry * 1.5)
                    return self.get_inversted(cname, retry=retry)
                else:
                    return None
        else:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (
                cname, retry, res.code)
            if retry < 5:
                retry += 1
                time.sleep(retry * 1.5)
                return self.get_inversted(cname, retry=retry)
            else:
                return None
Example #14
0
    def get_inversted(self, cname, retry):
        """
        查询投资信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment"
        encryptedJson = {
            "input" : cname,
            "v1" : "QZOrgV005"
        }

        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None or (res.code >= 400 and res.code < 500):
            if res is not None:
                res_code = res.code
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            if retry < 5:
                return self.get_inversted(cname, (retry+1))
            else:
                return None

        res_code = res.code
        if res_code >= 400 and res_code < 500:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            if retry < 5:
                return self.get_inversted(cname, (retry+1))
            else:
                return None
        elif res_code >= 500:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            time.sleep(1)
            return self.get_inversted(cname, retry)
        elif res.code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
                print "get_inversted --- exception res.text:\n", res.text
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_inversted(cname, (retry+1))
                else:
                    return None
            if len(c) == 0:
                print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_inversted(cname, (retry+1))
                else:
                    return None
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                list_inversted = eval(result)
            except Exception as err:
                print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
                print 'get_inversted --- eval(result) exception , result:\n', result
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_inversted(cname, (retry+1))
                else:
                    return None
            return list_inversted
        else:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d ---UNKNOW ERROR" % (cname, retry, res_code)
            if retry < 5:
                time.sleep(0.1)
                return self.get_inversted(cname, (retry+1))
            else:
                return None
Example #15
0
    def get_branch(self,cname, now_page, list_branch, retry):
        """
        查询分支机构
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page"
        encryptedJson = {
            "companyName" : cname,
            "v1" : "QZOrgV005",
            "page" : now_page,
            "pagesize" : "10"
        }

        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None or (res.code >= 400 and res.code < 500):
            if res is not None:
                res_code = res.code
            print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code)
            if retry < 5:
                time.sleep(0.1)
                return self.get_branch(cname,now_page, list_branch, (retry+1))
            else:
                return None

        res_code = res.code
        if res_code >= 500:
            print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code)
            time.sleep(1)
            return self.get_branch(cname, now_page, list_branch, (retry+1))
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code)
                print "get_branch --- exception res.text:\n", res.text
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_branch(cname, now_page, list_branch, (retry+1))
                else:
                    return None
            if len(c) == 0:
                print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- len(c)=0" % (cname, retry, now_page , res_code)
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_branch(cname, now_page, list_branch, (retry+1))
                else:
                    return None
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            temp = eval(result)
            if temp is not None:
                for t in temp['Branch']:
                    list_branch['Branch'].append(t)
                if len(temp['Branch']) == 10:
                    now_page += 1
                    # if now_page >= 10:
                    #     return list_branch
                    return self.get_branch(cname, now_page, list_branch, 0)
                else:
                    return list_branch
            else:
                print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- Branch is NULL" % (cname, retry, now_page , res_code)
                return None
        else:
            print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- UNKNOW ERROR" % (cname, retry, now_page , res_code)
            if retry < 5:
                time.sleep(1)
                return self.get_branch(cname, now_page, list_branch, (retry+1))
            else:
                return None
Example #16
0
    def flip_over(self, now_page, cname, line, cnt, retry):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "od_statusFilter": "0",
            "v1": "QZOrgV004",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])
        if res is None:
            if self.get_fail_cnt(1) < 10:
                print "%d-----%s ------ res is None" % (cnt, cname)
                self.add_job({'line': line, 'cnt': cnt})
                return False
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    self.get_tid(), self.proxies_dict[self.get_tid()])
                #self.query_company_info_failure.append(line)
                self.add_job({'line': line, 'cnt': cnt})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))

        elif res.code == 404 or res.code == 403:
            if self.get_fail_cnt(1) < 20:
                print "%d-----%s ------ %d" % (cnt, cname, res.code)
                self.add_job({'line': line, 'cnt': cnt})
                return False
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    self.get_tid(), self.proxies_dict[self.get_tid()])
                self.add_job({'line': line, 'cnt': cnt})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))

        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%d------%s ------ %d " % (cnt, cname, res.code)
            self.add_job({'line': line, 'cnt': cnt})
            time.sleep(1)
            return False
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------cname %s res.text is null----------------------------' % cname
                self.query_company_info_failure.append(line)
                return True
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            dic = eval(result)
            list = dic['list']
            if len(list) == 0:
                print 'cname %s result list length = 0 ' % cname
                self.query_company_info_failure.append(line)
                return True
            print 'cname %s result ###################  list length ------ %d' % (
                cname, len(list))
            for l in list:
                aa = {}
                for k, v in l.items():
                    aa[k] = v
                self.query_company_info.append(spider.util.utf8str(aa))
                part = cname + "|" + l['oc_name'] + "|" + str(
                    l['oc_area']) + "|" + str(l['oc_code']) + "|" + str(
                        l['oc_number'])
                self.query_company_info_part.append(part)
                self.get_detail(l['oc_name'], l['oc_code'], l['oc_area'])
            if len(list) < 20:
                return True
            elif len(list) == 20:
                now_page += 1
                self.flip_over(now_page, cname, line, cnt)
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)
            self.query_company_info_failure.append(line)
            return True
Example #17
0
 def get_gd(self, area, code, cname, retry):
     """
     获取股东信息
     """
     url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"
     encryptedJson = {
         "bl_oc_area" : area, #4107
         "v1" : "QZOrgV005",
         "bl_oc_code" : code #672867774
     }
     res = self.req_all(url, encryptedJson)
     res_code = 0
     if res is None or (res.code >= 400 and res.code < 500):
         if res is not None:
             res_code = res.code
         print "get_gd --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
         if retry < 5:
             time.sleep(0.1)
             return self.get_gd(area, code, cname, (retry+1))
         else:
             return None
     res_code = res.code
     if res_code >= 500:
         print "get_gd --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
         time.sleep(1)
         return self.get_gd(area, code, cname, retry)
     elif res_code == 200:
         try:
             c = eval(res.text)['c']
         except Exception as err:
             print "get_gd --- cname=%s --- retry=%d --- res.code=%d  " % (cname, retry, res_code)
             print "get_gd --- exception res.text:\n", res.text
             if retry < 5:
                 time.sleep(0.1)
                 return self.get_gd(area, code, cname, (retry+1))
             else:
                 return None
         if len(c) == 0:
             print "get_gd --- cname=%s --- retry=%d --- res.code=%d  len(c)=0" % (cname, retry, res_code)
             if retry < 5:
                 time.sleep(0.1)
                 return self.get_gd(area, code, cname, (retry+1))
             else:
                 return None
         result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
         try:
             list_gd = eval(result)
         except Exception as err:
             print "get_gd --- cname=%s --- retry=%d --- res.code=%d " % (cname, retry, res_code)
             print 'get_gd --- eval(result) exception , result:\n',result
             if retry < 5:
                 time.sleep(0.1)
                 return self.get_gd(area, code, cname, (retry+1))
             else:
                 return None
         return list_gd
     else:
         print "get_gd --- cname=%s --- retry=%d --- res.code=%d ---UNKNOW ERROR" % (cname, retry, res_code)
         if retry < 5:
             time.sleep(0.1)
             return self.get_gd(area, code, cname, (retry+1))
         else:
             return None
Example #18
0
    def get_detail(self, line, cnt, retry):
        tid = self.get_tid()
        try:
            param = eval(line)
        except Exception as err:
            print 'tid=%d --- cnt=%d --- data is not json, return'%(tid, cnt)
            self.record_spider(line,'UNKNOW')
            return
        cname = param['oc_name']
        if cname in self.bloom:
            cname = param['query_name']
            if cname in self.bloom:
                print 'query_name:%s aleready crawler...'%cname
                return
        ccode = param['oc_code']
        carea = param['oc_area']
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code" : ccode,#code,  #"71526726X"
            "v1" : "QZOrgV005",
            "isDirect" : "0",
            "bl_oc_name" : cname,#cname,  #"腾讯科技"
            "bl_oc_area" : carea #area #"4403"
        }
        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None :
            if self.get_fail_cnt(1, 'failcount-none') < 10:
                self.re_add_job({'line':line,'cnt':cnt, 'retry':retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
                return
            else:
                # if retry > 5:
                #     self.query_failure.append(line)
                #     self.record_spider(line, cname)
                #     return
                # else:
                self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
                self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError("Maybe the proxy invalid,failcount-none = [ %d ]" % self.get_fail_cnt(0, 'failcount-none'))
        else:
            setattr(self._curltls, 'failcount-none', 0)

        res_code = res.code
        if (res_code >= 400 and res_code < 500) or res_code == 202 :
            #print time.time(),"出现################",(time.time()-self.init_time), " res.code=", res_code
            # if retry > 20:
            #     self.query_failure.append(line)
            #     self.record_spider(line, cname)
            # else:
            self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
            if self.get_fail_cnt(1, 'failcount-400') > 30:
                self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError("Maybe the proxy invalid,failcount-400 = [ %d ]" % self.get_fail_cnt(0, 'failcount-400'))
            return
        else:
            setattr(self._curltls, 'failcount-400', 0)

        if res_code >= 500:
            # if retry > 5:
            #     self.query_failure.append(line)
            #     self.record_spider(line, cname)
            # else:
            self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
            time.sleep(2)
            return
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  exception res.text " % (tid, cnt, cname, retry, res_code)
                #print "exception res.text:\n", res.text
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return
            if len(c) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d   --- exception 'C' IS NULL" % (tid, cnt, cname, retry, res_code)
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                detail = eval(result)
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- exception result:%s" % (tid, cnt, cname, retry, res_code, result)
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return

            #print 'tid=', tid, 'proxy=', self.proxies_dict[tid], ' detail=',spider.util.utf8str(detail)
            #print 'tid=', tid, ' detail=',spider.util.utf8str(detail)

            #股东信息
            listGD = self.get_gd(carea, ccode, cname, 0)
            if listGD is not None:
                #print "tid=",tid," listGD=",spider.util.utf8str(listGD)
                detail['listGD'] = listGD['listGD']

            #投资信息
            list_inversted = self.get_inversted(cname, 0)
            if list_inversted is not None:
                #print "tid=",tid," list_inversted=",spider.util.utf8str(list_inversted)
                detail['inversted'] = list_inversted['inversted']

            #获取分支机构信息
            list_branch = self.get_branch(cname, 1, {"Branch": []}, 0)
            if list_branch is not None:
                #print "tid=",tid," list_branch=",spider.util.utf8str(list_branch)
                detail['Branch'] = list_branch['Branch']

            self.query_success.append(spider.util.utf8str(detail))
            self.record_spider(line, cname)

            print "tid=%d --- proxy=%s --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- success:\n %s" % (tid,self.proxies_dict[tid], cnt, cname, retry, res_code, spider.util.utf8str(detail))
        else:
            self.query_failure.append(line)
            self.record_spider(line, cname)
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- exception UNKNOW ERROR" % (tid, cnt, cname, retry, res_code)
            return
Example #19
0
    def flip_over(self, now_page, cname):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "od_statusFilter": "0",
            "v1": "QZOrgV004",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        extJson = {
            "cl_screenSize": "640x960",
            "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
            "Org_iOS_Version": "2.0.1"
        }
        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self._aes_.encrypt(spider.util.utf8str(extJson))
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url, headers=headers, data=param)
        if res is None:
            print 'res is none -- search company name is -->', cname
            self.fail_name.append(cname)
            return
        elif res.code == 404:
            print "%s ------ 404" % cname
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%s ------ %d " % (cname, res.code)
            self.add_job({'cname': cname})
            time.sleep(0.5)
            return
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------cname %s res.text is null----------------------------' % cname
                return
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            dic = eval(result)
            list = dic['list']
            if len(list) == 0:
                print 'cname %s result list length = 0 ' % cname
                return
            print 'cname %s result ################### now get list length is %d' % (
                cname, len(list))
            for l in list:
                aa = {}
                for k, v in l.items():
                    aa[k] = v
                self.save_success.append(spider.util.utf8str(aa))
                x = cname + "|" + l['oc_name'] + "|" + str(
                    l['oc_area']) + "|" + str(l['oc_code']) + "|" + str(
                        l['oc_number'])
                self.part_success.append(x)

            print "-------------------------------------------cname %s page %d finish-----------------------------------" % (
                cname, now_page)
            rowcount = dic['rowcount']
            print "==============cname %s=======page %d=========rowcount %d===========" % (
                cname, now_page, rowcount)
            # page_count = rowcount/20 if rowcount%20==0 else (rowcount/20+1)
            # if now_page < page_count:
            #     now_page += 1
            #     self.flip_over(now_page,cname)
            # time.sleep(0.1)
            now_page += 1
            time.sleep(0.1)
            self.flip_over(now_page, cname)
            return
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)
Example #20
0
    def __init__(self):
        """
        Set up proxy mode, output files and request fingerprints.

        ``self.is_debug`` names the proxy mode ("singleADSL", "kuaidaili" or
        "multiADSL"); the mode decides how ``self.proxies_dict`` is filled
        and which argument is handed to ``Spider.__init__``.
        """
        self._can_use_proxy_num = 0
        self.is_debug = "multiADSL"
        self.proxies = {}

        mode = self.is_debug
        if mode == "singleADSL":
            # single ADSL proxy mode
            Spider.__init__(self, 200)
            self.proxy_error_cnt = 0
        elif mode == "kuaidaili":
            # "kuaidaili" mode: proxies are read from a file
            self.proxies_dict = []
            self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
            Spider.__init__(self, len(self.proxies_dict))
        elif mode == "multiADSL":
            # multiple ADSL proxy mode; a fifth entry identical to the last
            # two is currently disabled
            self.proxies_dict = [
                {
                    'http': 'http://*****:*****@121.40.186.237:50001',
                    'https': 'https://*****:*****@121.40.186.237:50001'
                },
                {
                    'http': 'http://*****:*****@121.40.186.237:50001',
                    'https': 'https://*****:*****@121.40.186.237:50001'
                },
                {
                    'http': 'http://*****:*****@192.168.1.39:3428',
                    'https': 'https://*****:*****@192.168.1.39:3428'
                },
                {
                    'http': 'http://*****:*****@192.168.1.39:3428',
                    'https': 'https://*****:*****@192.168.1.39:3428'
                },
            ]
            Spider.__init__(self, 400)

        self._aes_ = CCIQ_AES()
        # details fetched successfully
        self.query_success = FileSaver("成功拿到的详情900.txt")
        # failed organisation codes and the reason
        self.query_failure = FileSaver("获取失败的机构代码和原因900.txt")
        # organisation codes that were already crawled
        self.already_cname_list = FileSaver("已经爬过机构代码900.txt")
        # organisation codes answered with HTTP 400
        self.result400 = FileSaver("结果http=400的机构代码900.txt")
        # load the companies that were already crawled
        self.init_cname()

        self.extJsons = [
            "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
        ]
        self.user_agents = [
            "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"
        ]

        self.is_first = True
        self.init_time = 0
        self.lock = threading.Lock()
        self.req_cnt = 0
Example #21
0
    def run_job(self, jobid):
        code = jobid.get("code")
        retry = jobid.get("retry")

        tid = self.get_tid()
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code": code,  #code,  #"71526726X"
            "v1": "QZOrgV005",
            "isDirect": "0",
            "bl_oc_name": "腾讯科技",  #cname,  #"腾讯科技"
            "bl_oc_area": ""  #area #"4403"
        }
        detail = {}
        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None:
            print code, "get detail     res is None !!"
            return
        res_code = res.code
        if res_code == 400:
            self.result400.append(code)
            self.req_cnt += 1
            return
        try:
            if u"服务不可用。" in res.text or u"Unauthorized!" in res.text:  # or u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志。" in res.text:
                self.re_add_job({'cname': code, 'retry': retry})
                print "系统不可用...", code, res.text
                return
            c = eval(res.text)['c']
        except Exception as err:
            print "tid=%d --- retry=%d --- res.code=%d  exception " % (
                tid, retry,
                res_code), err  #res.text=%s#, spider.util.utf8str(res.text)
            self.re_add_job({'cname': code, 'retry': retry})
            return
        if len(c) == 0:
            print "tid=%d --- retry=%d --- res.code=%d   --- exception 'C' IS NULL" % (
                tid, retry, res_code)
            self.query_failure.append(code + ",c=0")
            self.record_spider(code)
            return
        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            detail = eval(result)
        except Exception as err:
            print "tid=%d --- retry=%d --- res.code=%d  --- exception result:%s" % (
                tid, retry, res_code, result)
            self.query_failure.append(code + ",result_error")
            self.record_spider(code)
            return

        cname = None
        try:
            basic = detail["list"]
            if basic is None or len(basic) == 0:
                print code, " 此码无效...", spider.util.utf8str(detail)
                self.query_failure.append(code + ",list=0")
                self.record_spider(code)
                return
            cname = basic[0]["oc_name"]
        except Exception as err:
            print code, "获取基本详情错误,拿不到oc_name,detail : ", spider.util.utf8str(
                detail)
            return

        #股东信息
        # listGD = self.get_gd(code)
        # if listGD is not None:
        #     #print "tid=", tid, " listGD=", spider.util.utf8str(listGD)
        #     detail['listGD'] = listGD['listGD']

        #投资信息
        # list_inversted = self.get_inversted(cname)
        # if list_inversted is not None:
        #     #print "tid=", tid, " list_inversted=", spider.util.utf8str(list_inversted)
        #     detail['inversted'] = list_inversted['inversted']

        # #获取分支机构信息
        # branch = []
        # list_branch = self.get_branch(cname, list_branch=branch)
        # if list_branch is not None:
        #     #print "tid=", tid, " list_branch=", spider.util.utf8str(list_branch)
        #     detail['Branch'] = list_branch #['Branch']
        self.query_success.append(spider.util.utf8str(detail))
        self.record_spider(code)

        print "tid=%d --- retry=%d --- res.code=%d  @@@ success: %s \n " % (
            tid, retry, res_code, spider.util.utf8str(
                self.proxies)), spider.util.utf8str(detail)
Example #22
0
    def flip_over(self, now_page, cname, cnt, retry):
        tid = self.get_tid()
        """
        根据公司名查询公司列表,翻页
        """
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "sh_oc_areaName": "",
            "od_statusFilter": "0",
            "v1": "QZOrgV005",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        r_result = {"cname": cname}
        res = self.req_all(encryptedJson)
        res_code = 0
        if res is None:
            if self.get_fail_cnt('failcount-none', 1) < 10:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                    tid, cnt, cname, retry, res_code, now_page)
                return
            else:
                # if retry > 5:
                #     r_result["type"] = "None"
                #     self.already_error_type.append(spider.util.utf8str(r_result))
                #     self.record_spider(cname)
                #     print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (tid, cnt, cname, retry, res_code, now_page)
                # else:
                #     self.re_add_job({'cname':cname,'cnt':cnt, 'retry':(retry+1)})
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-none = [ %d ],tid=[ %d ]"
                    % (self.get_fail_cnt('failcount-none', 0), tid))
        else:
            setattr(self._curltls, 'failcount-none', 0)

        res_code = res.code

        if (res_code >= 400 and res_code < 500) or res_code == 202:
            if self.get_fail_cnt('failcount-400', 1) < 5:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                    tid, cnt, cname, retry, res_code, now_page)
                return
            else:
                if retry > 5:
                    r_result["type"] = "400+"
                    self.already_error_type.append(
                        spider.util.utf8str(r_result))
                    self.record_spider(cname)
                else:
                    self.re_add_job({
                        'cname': cname,
                        'cnt': cnt,
                        'retry': (retry + 1)
                    })
                    self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-400 = [ %d ],tid=[ %d ]"
                    % (self.get_fail_cnt('failcount-400', 0), tid))
        else:
            setattr(self._curltls, 'failcount-400', 0)

        if res_code >= 500:
            # if retry > 2:
            #     r_result["type"]="500"
            #     self.already_error_type.append(spider.util.utf8str(r_result))
            #     self.record_spider(cname)
            # else:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d " % (
                tid, cnt, cname, retry, res_code, now_page)
            time.sleep(random.randrange(1, 10, 1))
            return
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception res.text - %s" % (
                    tid, cnt, cname, retry, res_code, now_page, err)
                # r_result["type"] = "res_error"
                # self.already_error_type.append(spider.util.utf8str(r_result))
                # self.record_spider(cname)
                # self.error_cnt += 1
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                return
            if len(c) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception 'C' IS NULL" % (
                    tid, cnt, cname, retry, res_code, now_page)
                r_result["type"] = "c=0"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                dic = eval(result)
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception result:%s" % (
                    tid, cnt, cname, retry, res_code, now_page, result)
                r_result["type"] = "result_error"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            list = dic['list']
            if len(list) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % (
                    tid, cnt, cname, retry, res_code, now_page)
                r_result["type"] = "list=0"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            #print "tid=%d ### cnt=%d ### cname=%s ### retry=%d ### res.code=%d ### now_page:%d ### success:len(list):%d " % (tid, cnt, cname, retry, res_code, now_page, len(list))
            for l in list:
                aa = {"query_name": cname}
                for k, v in l.items():
                    aa[k] = v
                self.query_company_list.append(spider.util.utf8str(aa))
            print "******", len(list), spider.util.utf8str(list)
            if len(list) < 20:
                # r_result["type"] = "success"
                # self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                return
            elif len(list) == 20:
                if now_page > 100:
                    self.already_error_type.append(
                        spider.util.utf8str(r_result))
                    self.record_spider(cname)
                    return
                now_page += 1
                self.flip_over(now_page, cname, cnt, retry)
        else:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception UNKNOW ERROR" % (
                tid, cnt, cname, retry, res_code, now_page)
            if retry < 3:
                self.re_add_job({
                    'cname': cname,
                    'cnt': cnt,
                    'retry': (retry + 1)
                })
                return
            r_result["type"] = "unknown_error"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            return
Example #23
0
    def flip_over(self, now_page, cname, cnt, retry):
        tid = self.get_tid()
        """
        根据公司名查询公司列表,翻页
        """
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "sh_oc_areaName": "",
            "od_statusFilter": "0",
            "v1": "QZOrgV005",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        r_result = {"cname": cname}
        res = self.req_all(encryptedJson, cname=cname)
        res_code = 0
        if res is None:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            return
        if u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志" in res.text:
            print "处理请求时服务器遇到错误。有关详细信息,请参见服务器日志..."
            if retry < 3:
                self.re_add_job({
                    'cname': cname,
                    'cnt': cnt,
                    'retry': (retry + 1)
                })
                return
            else:
                r_result["type"] = "request-server-error"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                return

        try:
            c = eval(res.text)['c']
        except Exception as err:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception res.text = %s" % (
                tid, cnt, cname, retry, res_code, now_page,
                spider.util.utf8str(res.text))
            if retry < 3:
                self.re_add_job({
                    'cname': cname,
                    'cnt': cnt,
                    'retry': (retry + 1)
                })
            else:
                r_result["type"] = "res.text=invalid"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
            return
        if len(c) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception 'C' IS NULL" % (
                tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "c=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            dic = eval(result)
        except Exception as err:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception result:%s" % (
                tid, cnt, cname, retry, res_code, now_page, result)
            r_result["type"] = "result_error"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        list = dic['list']
        if len(list) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % (
                tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "list=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        for l in list:
            aa = {"query_name": cname}
            for k, v in l.items():
                aa[k] = v
            self.query_company_list.append(spider.util.utf8str(aa))
        print "******", len(list), spider.util.utf8str(list)
        if len(list) < 20:
            self.record_spider(cname)
            return
        elif len(list) == 20:
            if now_page > 2:
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                return
            now_page += 1
            self.flip_over(now_page, cname, cnt, retry)
Example #24
0
    def get_detail(self, cname, code, area):
        """
        查询某公司详细信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code": code,  #"71526726X"
            "v1": "QZOrgV004",
            "isDirect": "1",
            "bl_oc_name": cname,  #"腾讯科技"
            "bl_oc_area": area  #"4403"
        }

        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJson
        })
        res = self.request_url(url,
                               headers=self.headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'get_detail ------ res is none ,---->cname=', cname
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 404:
            print "get_detail ------ 404 ------ ", cname, code
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print 'get_detail ------ ', res.code, cname, code
            time.sleep(0.5)
            self.get_detail(cname, code, area)
            return
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------code ', code, ' res.text is null----------------------------'
                self.detail_failure.append(cname + "|" + str(code) + "|" +
                                           str(area))
                return
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            detail = eval(result)
            #获取股东信息
            listGD = self.get_gd(area, code)
            if listGD is not None:
                detail['listGD'] = listGD['listGD']

            #获取投资信息
            list_inversted = self.get_inversted(cname)
            if list_inversted is not None:
                detail['inversted'] = list_inversted['inversted']

            #获取分支机构信息
            list_branch = self.get_branch(cname, 1, {"Branch": []})
            if list_branch is not None:
                detail['Branch'] = list_branch['Branch']

            print 'detail=================================', spider.util.utf8str(
                detail)
            self.detail_company.append(spider.util.utf8str(detail))
            return
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)