Example #1
 def save_info(self, job, jsonobj):
     with self.locker:
         if job["type"] == "QuerySummary":
             name = jsonobj.get("ENTNAME", "-")
             regNo = jsonobj.get("REGNO", "-")
             id = jsonobj.get("ID", "-")
             self.namefile.write(job["line"] + " " + name.encode("utf-8") +
                                 " " + regNo.encode("utf-8") + " " +
                                 id.encode("utf-8") + "\n")
             self.namefile.flush()
             self.binsaver.append(
                 name.encode("utf-8") + "_" + regNo.encode("utf-8"),
                 json.dumps(jsonobj))
             spider.runtime.Log.info("%s:%s=========>saved." %
                                     (job["Q"], name))
         elif job["type"] == "QueryAutoName":
             if "ERRCODE" in jsonobj:
                 if not self.re_add_job(job):
                     self.save_fail_info(job)
                 Log.error("ErrCode, proxy down.")
                 raise AccountErrors.NoAccountError()
             for name in jsonobj:
                 self.namefile.write(job["line"] + " " +
                                     name.encode("utf-8") + "\n")
                 self.namefile.flush()
                 spider.runtime.Log.info("%s:%s=========>saved." %
                                         (job["Q"], name))
Example #2
 def run_job(self, jobid):
     self.num_count += 1
     #print "job is ", jobid
     url = "http://www.jobui.com/job/%d/" % (jobid)
     # url = 'http://www.jobui.com/job/1962956760/'
     res = self.request_url(url)
     # A None response means the request itself failed; check before touching res.code.
     if res is None:
         print "%d failed, sleeping 5 secs." % jobid
         time.sleep(5)
         self.add_job(jobid)
         return
     print "id: {}, page status: {}".format(jobid, res.code)
     if res.code == 404:
         time.sleep(3)
         return
     elif res.code == 503:
         print "maybe speed too fast..."
         time.sleep(5)
         self.add_job(jobid)
         return
     elif res.code == 200:
         print "saving %d ..." % jobid
         with self._savelock:
             with open("jobid.txt", "a+b") as f:
                 f.write("%s\n" % jobid)
                 #f.flush()
                 #fn = 'jobui_job.%d.%d' % (jobid, int(time.time()))
                 #self.bs.append(fn, res.text)
         time.sleep(5)
     else:
         Log.error("unknown xxxxx")
         Log.errorbin("%s" % jobid, res.text)
         raise AccountErrors.NoAccountError('fatal error')
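The None/404/503/200 handling above is the policy most of the later examples repeat: failed or throttled requests are requeued after a short sleep, 404s are dropped, 200s are saved. A minimal table-driven sketch of that policy (the sleep values and the add_job callback are illustrative, not part of the original spider framework):

import time

# (sleep_seconds, requeue) keyed by status; None stands for a failed request.
RETRY_POLICY = {
    None: (5, True),   # request failed: back off, then requeue
    404: (3, False),   # page gone: drop the job
    503: (5, True),    # throttled: back off, then requeue
}

def handle_status(code, jobid, add_job):
    sleep_secs, requeue = RETRY_POLICY.get(code, (0, False))
    if sleep_secs:
        time.sleep(sleep_secs)
    if requeue:
        add_job(jobid)
    return requeue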
Example #3
    def run_job(self, jobid):
        gsweb = getattr(self._curltls, "gsweb", None)
        if gsweb is None:
            gsweb = self.init_obj()
        cname = jobid.get("name")
        cnt = jobid.get("cnt")
        out = gsweb.search_company(cname)
        if out is None:
            self.job_retry(jobid)
            if self.get_fail_cnt("failcnt", 1) > 10:
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-none = [ %d ]" %
                    self.get_fail_cnt("failcnt", 0))
        else:
            setattr(self._curltls, "failcnt", 0)

            if len(out) == 0:
                print cnt, "--->", cname, '---> query list result length = 0'
                #self.record_spider(cname)
                self.result_null.append(cname)
                filter_name.add(cname)
            else:
                for oi in out:
                    self.success.append(spider.util.utf8str(oi))
                self.record_spider(cname)

            if time.time() - self.run_time > 20:
                print "speed------> ------> ------> ------> ------> ------>", self.cnt / (
                    time.time() - self.run_time), "t/s"
                self.run_time = time.time()
                self.cnt = 1
        time.sleep(random.randrange(1, 6, 1))
Example #4
 def get_fail_cnt(self, addv):
     # Per-thread failure counter kept on a threading.local object.
     fc = getattr(self._curltls, "failcnt", 0)
     if fc > 10:
         raise AccountErrors.NoAccountError("Maybe the proxy invalid, failcnt = [ %d ]" % fc)
     if addv:
         fc += addv
         setattr(self._curltls, "failcnt", fc)
     return fc  # return the count so callers can compare it against their own limit
Example #5
    def run_job(self, jobid):
        jobid_int = jobid.get("id")
        url = "http://www.jobui.com/job/%d/" % (jobid_int)
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        #print "this request tid = [ %s ] proxies = [ %s ]" % (tid,proxies)
        res = self.request_url(url, proxies=proxies)

        self.num_count += 1

        #print "id : %d ------------- response code : %s " % (jobid_int, "Response Is None" if res is None else str(res.code))

        if res is None:
            if self.get_fail_cnt(1) < 10:
                self.add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (proxies, self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls, 'failcnt', 0)  # reset the same attribute get_fail_cnt reads

        if res.code == 404:
            print "%d ------ 404" % jobid_int
            return
        elif res.code in (500, 502, 503, 504):
            print "%d ------ %d " % (jobid_int, res.code)
            self.add_job(jobid)
            time.sleep(0.8)
            return
        elif res.code == 200:
            print "%d ------ saving " % jobid_int
            fn = 'jobui_job.%d.%d' % (jobid_int, int(time.time()))
            self.bs.append(fn, res.text)
            if self.bs.getsize() >= 8 * 1024 * 1024 * 1024:
                raise AccountErrors.NoAccountError('file too large')
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            Log.error("unknown error...")
            Log.errorbin("%s" % jobid_int, res.text)
            raise AccountErrors.NoAccountError('fatal error')
Example #6
 def count_proxy_error(self, error_type):
     # Reset the count when error_type is truthy; otherwise record one more
     # proxy error and raise once more than 10 have accumulated.
     cnt = getattr(self._proxy_error, "proxy_error_cnt", 0)
     if error_type:
         setattr(self._proxy_error, "proxy_error_cnt", 0)
     elif cnt > 10:
         raise AccountErrors.NoAccountError("THE PROXY IS INVALID ! ! !")
     else:
         setattr(self._proxy_error, "proxy_error_cnt", cnt + 1)
Example #7
 def job_retry(self, job, addv):
     # Bump the job's retry count and requeue it.
     retry = job.get("retry", 0) + 1
     job["retry"] = retry
     self.re_add_job(job)
     if self.get_fail_cnt("failcnt", addv) > 15:
         if not self.is_debug:
             raise AccountErrors.NoAccountError(
                 "Maybe the proxy invalid,failcount-none = [ %d ]" %
                 self.get_fail_cnt("failcnt", 0))
Example #8
    def run_job(self, jobid):
        url = jobid.get("url")
        print "url == ",url
        tid = self.get_tid()
        res = self.sessionReq.request_url(url)

        self.num_count += 1

        if res is None:
            if self.get_fail_cnt(1) < 10:
                self.add_job(jobid)
            else:
                self.__fail_urls.append(url)
                raise AccountErrors.NoAccountError("failcount = [ %d ]" % (self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls, 'failcount', 0)

        if res.code == 404:
            print "%s ------ 404" % url
            return
        elif res.code in (500, 502, 503, 504):
            print "%s ------ %d " % (url,res.code)
            self.add_job(jobid)
            time.sleep(0.8)
            return
        elif res.code == 200:
            print "%s ------ saving " % url
            # Assumption: parse() returns the extracted records; as originally
            # written, an always-empty list was dumped under "key".
            items = self.parse(res.content)
            record = {"key": items}  # renamed from `type`, which shadowed the builtin
            self.savefile.append(json.dumps(record))
            #print "content======\n", res.content
            #self.bs.append(fn, res.text)
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            #Log.error("unknown error...")
            #Log.errorbin("%s" %url, res.text)
            raise AccountErrors.NoAccountError('fatal error')
Example #9
    def run_job(self, job):
        Log.info("Running job:" + spider.util.utf8str(job))
        # thread_check
        if not self.thread_check():
            # raise NoAccountError to set end_this_thread true; the spider will re-add the job. See Spider._job_runner
            raise AccountErrors.NoAccountError()

        if job["type"] == "QuerySummary":
            self.get_summary(job)
        elif job["type"] == "QueryDetail":
            self.get_detail(job)
        elif job["type"] == "QueryAutoName":
            self.get_autoname(job)
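The if/elif dispatch on job["type"] can also be a lookup table, which keeps run_job short as job types grow. A sketch (the handler methods are assumed to exist, as in the example above):

class QueryRunner(object):
    # Table-driven alternative to the if/elif chain above.
    HANDLERS = {
        "QuerySummary": "get_summary",
        "QueryDetail": "get_detail",
        "QueryAutoName": "get_autoname",
    }

    def run_job(self, job):
        name = self.HANDLERS.get(job["type"])
        if name is None:
            raise ValueError("unknown job type: %r" % job["type"])
        getattr(self, name)(job)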
Example #10
    def run_job(self, jobid):
        url = jobid.get("url")
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        res = self.request_url(url, proxies=proxies)

        self.request_count += 1

        if res is None:
            if self.get_fail_cnt(1) < 3:
                self.add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    tid, proxies)
                self.__fail_add_url.append(url)
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (proxies, self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls, 'failcount', 0)

        if res.code == 404:
            print "%s ======》 404" % url
            return
        elif res.code in (500, 502, 503, 504):
            print "%s ------> %d " % (url, res.code)
            self.add_job(jobid)
            time.sleep(1)
            return
        elif res.code == 200:
            print "%s ————> will be into database......." % url
            #http://www.jobui.com/job/92336088/
            m = re.search(ur'http://www.jobui.com/job/(\d+)/', url)
            if m:
                jid = m.group(1)
                self.page_store.save(int(time.time()), jid, url, res.text)
                self.success_count += 1
                self.parseDomain(res.text)
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            self.__fail_add_url.append(url)
            #raise AccountErrors.NoAccountError('fatal error')

        #if self.request_count % 10000 == range(0,9):
        print "request_count:{},success_count:{},request_speed:{}".format(
            self.request_count, self.success_count,
            self.request_count / (time.time() - self.start_time))
Example #11
    def run_job(self, job):
        gsweb = self.init_obj()
        kw = job.get("kw")
        retry = job.get("retry")
        cnt = job.get("cnt")
        out = gsweb.search_company(kw)
        if out is None:
            self.job_retry(job)
            return
        if len(out) != 0 and out[0] == "stop":
            self.job_retry(job)
            raise AccountErrors.NoAccountError("The proxy invalid , IP stop !!!")
        total = len(out)  # renamed from `all`, which shadowed the builtin
        scs_cnt = 0
        for oi in out:
            cname = oi["name"]
            url = oi["url"]
            regcode = oi["regcode"]
            s = cname+","+str(regcode)
            if s in filter_queries:
                #如果已经爬取过了,略过
                all -= 1
                continue
            retry2 = 0
            while True:
                flag = gsweb.get_detail(url, cname, regcode)
                if flag:
                    self.record_spider_queries(s)
                    scs_cnt += 1
                    break
                else:
                    #self.get_fail_cnt(1)
                    retry2 += 1
                    if retry2 > 5:
                        break
                    else:
                        time.sleep(random.randrange(3, 8, 1))

        if scs_cnt == total:
            self.record_spider_kw(kw)
        else:
            self.job_retry(job)

        if time.time() - self.run_time > 20:
            print "speed------> ------> ------> ------> ------> ------>", self.cnt/(time.time() - self.run_time), "t/s"
            self.run_time = time.time()
            self.cnt = 1
Example #12
    def run_job(self, job):
        if job["type"] is "u1":
            key = job["kw"]
            page = str(job["page"])
            url = "http://qichacha.com/search?key=" + key + "&index=name&" + "p=" + page
            # con = self.qcclogin.request_url(url)
            con = self.qcc_acc_manager.el_request(url)
            res = con.text
            if res.strip() == "":
                time.sleep(10)
                self.add_job(job)
                return
            elif re.search(u'小查还没找到数据', res):  # the site's "no results found" message
                Log.error("key=" + key + ", page=" + page + ", no data!\n")
            else:
                Log.error("searching %s" % key)
                urls = self._match(
                    res, r'<h3 class="site-list-title"><a href="(.*?)"')
                if len(urls) == 0:
                    Log.errorbin("%s %s" % (key, url), con.text)
                    raise AccountErrors.NoAccountError(key)
                for u in urls:
                    job2 = {"url": u, "type": "u2", "retrycnt": "0"}
                    self.add_job(job2)
                # catch page 1 only
                # if page is '1':
                #     corp_count = int(self._match(res, r'<span class="search-key">(.*?)</span>')[0])
                #     pg_count = (corp_count + 9)/10
                #     #not vip limit in 10 pages
                #     if pg_count >= 10:
                #         pg_count = 10
                #     for i in range(2, pg_count+1):
                #         job3 = {"kw": key, "page": str(i), "type": "u1"}
                #         self.add_job(job3)

        elif job["type"] is "u2":
            url = "http://qichacha.com" + job["url"]
            cpid = job["url"][1:]

            if self.pagestore.check_should_fetch(cpid):
                con = self.request_url(url)
                if con is None or self.retry(con, job):
                    return
                self.pagestore.save(int(time.time()), cpid, url, con.text)
            else:
                Log.warning("skip ", cpid)
Example #13
    def run_job(self, jobid):
        jobid_int = jobid.get("id")
        url = "http://www.jobui.com/job/%d/" % (jobid_int)
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        res = self.request_url(url, proxies=proxies)

        self.request_count += 1

        if res is None:
            if self.get_fail_cnt(1) < 10:
                self.add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (proxies, self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls, 'failcount', 0)

        if res.code == 404:
            print "%d ======》 404" % jobid_int
            return
        elif res.code in (500, 502, 503, 504):
            print "%d ------> %d " % (jobid_int, res.code)
            self.add_job(jobid)
            time.sleep(1)
            return
        elif res.code == 200:
            print "%d ————> will be into database......." % jobid_int
            self.page_store.save(int(time.time()), str(jobid_int), url,
                                 res.text)
            self.success_count += 1
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            self.__fail_ids.append(str(jobid_int))
            #raise AccountErrors.NoAccountError('fatal error')

        #if self.request_count % 10000 == range(0,9):
        print "request_count:{},success_count:{},request_speed:{}".format(
            self.request_count, self.success_count,
            self.request_count / (time.time() - self.start_time))
Example #14
    def flip_over(self, now_page, cname, line, cnt, retry):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "od_statusFilter": "0",
            "v1": "QZOrgV004",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])
        if res is None:
            if self.get_fail_cnt(1) < 10:
                print "%d-----%s ------ res is None" % (cnt, cname)
                self.add_job({'line': line, 'cnt': cnt})
                return False
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    self.get_tid(), self.proxies_dict[self.get_tid()])
                #self.query_company_info_failure.append(line)
                self.add_job({'line': line, 'cnt': cnt})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))

        elif res.code in (404, 403):
            if self.get_fail_cnt(1) < 20:
                print "%d-----%s ------ %d" % (cnt, cname, res.code)
                self.add_job({'line': line, 'cnt': cnt})
                return False
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    self.get_tid(), self.proxies_dict[self.get_tid()])
                self.add_job({'line': line, 'cnt': cnt})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))

        elif res.code in (500, 502, 503, 504):
            print "%d------%s ------ %d " % (cnt, cname, res.code)
            self.add_job({'line': line, 'cnt': cnt})
            time.sleep(1)
            return False
        elif res.code == 200:
            c = eval(res.text)['c']  # NOTE: json.loads would be safer if the payload is strict JSON
            if len(c) == 0:
                print '-----------------------------cname %s res.text is null----------------------------' % cname
                self.query_company_info_failure.append(line)
                return True
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            dic = eval(result)
            items = dic['list']  # renamed from `list` to avoid shadowing the builtin
            if len(items) == 0:
                print 'cname %s result list length = 0 ' % cname
                self.query_company_info_failure.append(line)
                return True
            print 'cname %s result ###################  list length ------ %d' % (
                cname, len(items))
            for l in items:
                aa = dict(l)
                self.query_company_info.append(spider.util.utf8str(aa))
                part = cname + "|" + l['oc_name'] + "|" + str(
                    l['oc_area']) + "|" + str(l['oc_code']) + "|" + str(
                        l['oc_number'])
                self.query_company_info_part.append(part)
                self.get_detail(l['oc_name'], l['oc_code'], l['oc_area'])
            if len(items) < 20:
                return True
            # a full page of 20 rows means there may be more; fetch the next page
            now_page += 1
            return self.flip_over(now_page, cname, line, cnt, retry)
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)
            self.query_company_info_failure.append(line)
            return True
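flip_over recurses into itself for every full page of 20 rows, so a company with many pages grows the call stack. The same pagination reads naturally as a loop; a sketch where fetch_page stands in for the request-and-decrypt step above:

def iterate_pages(fetch_page, page_size=20, max_pages=100):
    # Keep fetching while full pages come back; a short page ends the scan.
    page = 1
    while page <= max_pages:
        rows = fetch_page(page)
        for row in rows:
            yield row
        if len(rows) < page_size:
            break
        page += 1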
Example #15
    def run_job(self, jobid):
        tid = self.get_tid()
        gsweb = getattr(self._curltls, "gsweb", None)
        if gsweb is None:
            gsweb = self.init_obj()

        tp = jobid["type"]
        cnt = jobid.get("cnt")
        if tp == "query":
            qname = jobid.get("qname")
            if qname in filter_name:
                print cnt, "已经查询过:", qname
                return
            out = gsweb.search_company(qname)
            if out is None:
                self.job_retry(jobid, 1)
                return
            else:
                setattr(self._curltls, "failcnt", 0)
            if "PROXY-ERROR" in out:
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid - PROXY-ERROR ")
            elif len(out) == 0:
                print cnt, qname, 'company list query returned empty...'
                self.record_spider(qname)
                return
            else:
                for oi in out:
                    cname = oi["name"]
                    if cname in filter_name:
                        print cnt, "已经爬取过:", cname
                        return
                    job = {"oi": oi, "type": "detail", "cnt": cnt, "retry": 0}
                    self.add_main_job(job)
                    self.un_spider_name.append(cname)
                self.record_spider(qname)

        elif tp == "detail":
            oi = jobid["oi"]
            cname = oi["name"]
            url = oi["url"]
            regist_code = oi["regcode"]
            gd = "gsxt.gdgs.gov.cn/aiccips/GSpublicity/GSpublicityList.html"
            sz = "szcredit.com.cn/web/GSZJGSPT/QyxyDetail.aspx"
            gz1 = "gsxt.gzaic.gov.cn/search/search!entityShow"
            gz2 = "gsxt.gzaic.gov.cn/aiccips/GSpublicity/GSpublicityList.html"
            flg = None
            if gd in url:
                flg = gsweb.get_GSpublicityList(cnt, cname, url, regist_code)
            elif sz in url:
                flg = gsweb.get_QyxyDetail(cnt,
                                           cname,
                                           url,
                                           regist_code,
                                           tid=tid)
            elif gz1 in url:
                flg = gsweb.get_entityShow(cnt, cname, url, regist_code)
                # this link type requires re-initialising the session object afterwards
                self.init_obj()
            elif gz2 in url:
                flg = gsweb.get_guangzhou(cnt, cname, url, regist_code)
            else:
                print "未知的链接类型--->", url
                Log.error("UNKNOWN LINK TYPE," + url)
                return

            if flg == "success":
                self.record_spider(cname)
            elif flg == "proxy_error":
                self.job_retry(jobid, 1)
            elif flg == "notdisplay":
                oi["error"] = "notdisplay"
                self.not_show_save.append(spider.util.utf8str(oi))
                #self.job_retry(jobid, 0)
            elif flg == "return_error":
                oi["error"] = "return_page_error"
                self.not_show_save.append(spider.util.utf8str(oi))
                #self.job_retry(jobid, 0)
            else:
                self.job_retry(jobid, 0)

        if time.time() - self.run_time > 20:
            print cnt, "====================== speed =====================", self.cnt / (
                time.time() - self.run_time), "t/s"
            self.run_time = time.time()
            self.cnt = 1
Example #16
    def get_detail(self, line, cnt, retry):
        tid = self.get_tid()
        try:
            param = eval(line)  # line is a repr'd dict; json.loads would reject its single quotes
        except Exception as err:
            print 'tid=%d --- cnt=%d --- data is not a dict, return' % (tid, cnt)
            self.record_spider(line, 'UNKNOWN')
            return
        cname = param['oc_name']
        if cname in self.bloom:
            cname = param['query_name']
            if cname in self.bloom:
                print 'query_name:%s already crawled...' % cname
                return
        ccode = param['oc_code']
        carea = param['oc_area']
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code": ccode,    # e.g. "71526726X"
            "v1": "QZOrgV005",
            "isDirect": "0",
            "bl_oc_name": cname,    # e.g. "腾讯科技"
            "bl_oc_area": carea     # e.g. "4403"
        }
        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None:
            if self.get_fail_cnt(1, 'failcount-none') < 10:
                self.re_add_job({'line':line,'cnt':cnt, 'retry':retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
                return
            else:
                # if retry > 5:
                #     self.query_failure.append(line)
                #     self.record_spider(line, cname)
                #     return
                # else:
                self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
                self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError("Maybe the proxy invalid,failcount-none = [ %d ]" % self.get_fail_cnt(0, 'failcount-none'))
        else:
            setattr(self._curltls, 'failcount-none', 0)

        res_code = res.code
        if (400 <= res_code < 500) or res_code == 202:
            #print time.time(),"出现################",(time.time()-self.init_time), " res.code=", res_code
            # if retry > 20:
            #     self.query_failure.append(line)
            #     self.record_spider(line, cname)
            # else:
            self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
            if self.get_fail_cnt(1, 'failcount-400') > 30:
                self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError("Maybe the proxy invalid,failcount-400 = [ %d ]" % self.get_fail_cnt(0, 'failcount-400'))
            return
        else:
            setattr(self._curltls, 'failcount-400', 0)

        if res_code >= 500:
            # if retry > 5:
            #     self.query_failure.append(line)
            #     self.record_spider(line, cname)
            # else:
            self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
            time.sleep(2)
            return
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  exception res.text " % (tid, cnt, cname, retry, res_code)
                #print "exception res.text:\n", res.text
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return
            if len(c) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d   --- exception 'C' IS NULL" % (tid, cnt, cname, retry, res_code)
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                detail = eval(result)
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- exception result:%s" % (tid, cnt, cname, retry, res_code, result)
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return

            #print 'tid=', tid, 'proxy=', self.proxies_dict[tid], ' detail=',spider.util.utf8str(detail)
            #print 'tid=', tid, ' detail=',spider.util.utf8str(detail)

            # shareholder information
            listGD = self.get_gd(carea, ccode, cname, 0)
            if listGD is not None:
                #print "tid=",tid," listGD=",spider.util.utf8str(listGD)
                detail['listGD'] = listGD['listGD']

            # investment information
            list_inversted = self.get_inversted(cname, 0)
            if list_inversted is not None:
                #print "tid=",tid," list_inversted=",spider.util.utf8str(list_inversted)
                detail['inversted'] = list_inversted['inversted']

            # branch office information
            list_branch = self.get_branch(cname, 1, {"Branch": []}, 0)
            if list_branch is not None:
                #print "tid=",tid," list_branch=",spider.util.utf8str(list_branch)
                detail['Branch'] = list_branch['Branch']

            self.query_success.append(spider.util.utf8str(detail))
            self.record_spider(line, cname)

            print "tid=%d --- proxy=%s --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- success:\n %s" % (tid,self.proxies_dict[tid], cnt, cname, retry, res_code, spider.util.utf8str(detail))
        else:
            self.query_failure.append(line)
            self.record_spider(line, cname)
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- exception UNKNOW ERROR" % (tid, cnt, cname, retry, res_code)
            return
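get_detail parses the service response with eval(res.text), which executes whatever the network sends back. If the payload is valid JSON, json.loads is the safe equivalent; a sketch (the 'c' envelope field is taken from the examples above):

import json

def extract_c(text):
    # Safer stand-in for eval(res.text)['c']: json.loads never executes code.
    try:
        return json.loads(text)["c"]
    except (ValueError, KeyError):
        return None  # malformed payload; let the caller decide whether to retry

Note the originals may rely on eval precisely because the service returns Python-style single-quoted dicts, which json.loads rejects; that is worth verifying before swapping it in.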
Example #17
    def run_job(self, jobid):
        jobid_int = jobid.get("id")
        retry = jobid.get("retry")
        url = "http://www.jobui.com/job/%d/" % (jobid_int)
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        res = self.request_url(url, proxies=proxies)

        self.request_count += 1
        self.serial_num += 1

        if res is None:
            if self.get_fail_cnt(1, 'failcount-none') < 10:
                self.re_add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (proxies, self.get_fail_cnt(0, 'failcount-none')))
            return  # a None response has no .code; the checks below would crash
        else:
            setattr(self._curltls, 'failcount-none', 0)

        if res.code == 407:
            if self.get_fail_cnt(1, 'failcount-407') < 10:
                self.re_add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (proxies, self.get_fail_cnt(0, 'failcount-407')))
            return  # 407 already handled; falling through would hit the unknown-error branch
        else:
            setattr(self._curltls, 'failcount-407', 0)

        if res.code == 404:
            print "%d ======》 404 ---> retry:%d" % (jobid_int, retry)
            if retry < 3:
                self.re_add_job({"id": jobid_int, "retry": (retry + 1)})
            else:
                self.__fail_ids.append(str(jobid_int))
            #return
        elif res.code in (500, 502, 503, 504):
            print "%d ------> %d " % (jobid_int, res.code)
            self.re_add_job(jobid)
            time.sleep(random.randrange(1, 3, 1))
            #return
        elif res.code == 200:
            self.serial_num = 0
            print "%d ————> will be into database......." % jobid_int
            self.page_store.save(int(time.time()), str(jobid_int), url,
                                 res.text)
            self.success_count += 1
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            if retry < 3:
                self.re_add_job({"id": jobid_int, "retry": (retry + 1)})
            else:
                self.__fail_ids.append(str(jobid_int))

        print "serial_number:{},request_count:{},success_count:{},request_speed:{}".format(
            self.serial_num, self.request_count, self.success_count,
            self.request_count / (time.time() - self.start_time))
Example #18
    def flip_over(self, now_page, cname, cnt, retry):
        """Query the company list by name and page through the results."""
        tid = self.get_tid()
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "sh_oc_areaName": "",
            "od_statusFilter": "0",
            "v1": "QZOrgV005",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        r_result = {"cname": cname}
        res = self.req_all(encryptedJson)
        res_code = 0
        if res is None:
            if self.get_fail_cnt('failcount-none', 1) < 10:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                    tid, cnt, cname, retry, res_code, now_page)
                return
            else:
                # if retry > 5:
                #     r_result["type"] = "None"
                #     self.already_error_type.append(spider.util.utf8str(r_result))
                #     self.record_spider(cname)
                #     print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (tid, cnt, cname, retry, res_code, now_page)
                # else:
                #     self.re_add_job({'cname':cname,'cnt':cnt, 'retry':(retry+1)})
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-none = [ %d ],tid=[ %d ]"
                    % (self.get_fail_cnt('failcount-none', 0), tid))
        else:
            setattr(self._curltls, 'failcount-none', 0)

        res_code = res.code

        if (400 <= res_code < 500) or res_code == 202:
            if self.get_fail_cnt('failcount-400', 1) < 5:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                    tid, cnt, cname, retry, res_code, now_page)
                return
            else:
                if retry > 5:
                    r_result["type"] = "400+"
                    self.already_error_type.append(
                        spider.util.utf8str(r_result))
                    self.record_spider(cname)
                else:
                    self.re_add_job({
                        'cname': cname,
                        'cnt': cnt,
                        'retry': (retry + 1)
                    })
                    self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-400 = [ %d ],tid=[ %d ]"
                    % (self.get_fail_cnt('failcount-400', 0), tid))
        else:
            setattr(self._curltls, 'failcount-400', 0)

        if res_code >= 500:
            # if retry > 2:
            #     r_result["type"]="500"
            #     self.already_error_type.append(spider.util.utf8str(r_result))
            #     self.record_spider(cname)
            # else:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d " % (
                tid, cnt, cname, retry, res_code, now_page)
            time.sleep(random.randrange(1, 10, 1))
            return
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception res.text - %s" % (
                    tid, cnt, cname, retry, res_code, now_page, err)
                # r_result["type"] = "res_error"
                # self.already_error_type.append(spider.util.utf8str(r_result))
                # self.record_spider(cname)
                # self.error_cnt += 1
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                return
            if len(c) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception 'C' IS NULL" % (
                    tid, cnt, cname, retry, res_code, now_page)
                r_result["type"] = "c=0"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                dic = eval(result)
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception result:%s" % (
                    tid, cnt, cname, retry, res_code, now_page, result)
                r_result["type"] = "result_error"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            items = dic['list']  # renamed from `list` to avoid shadowing the builtin
            if len(items) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % (
                    tid, cnt, cname, retry, res_code, now_page)
                r_result["type"] = "list=0"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            #print "tid=%d ### cnt=%d ### cname=%s ### retry=%d ### res.code=%d ### now_page:%d ### success:len(list):%d " % (tid, cnt, cname, retry, res_code, now_page, len(items))
            for l in items:
                aa = {"query_name": cname}
                aa.update(l)
                self.query_company_list.append(spider.util.utf8str(aa))
            print "******", len(items), spider.util.utf8str(items)
            if len(items) < 20:
                # r_result["type"] = "success"
                # self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                return
            elif len(items) == 20:
                if now_page > 100:
                    self.already_error_type.append(
                        spider.util.utf8str(r_result))
                    self.record_spider(cname)
                    return
                now_page += 1
                self.flip_over(now_page, cname, cnt, retry)
        else:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception UNKNOW ERROR" % (
                tid, cnt, cname, retry, res_code, now_page)
            if retry < 3:
                self.re_add_job({
                    'cname': cname,
                    'cnt': cnt,
                    'retry': (retry + 1)
                })
                return
            r_result["type"] = "unknown_error"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            return
Example #19
    def run_job(self, jobid):
        url = jobid.get("url")
        retry = jobid.get("retry")
        tid = self.get_tid()
        proxies = self.proxies_dict[tid]
        res = self.request_url(url, proxies=proxies)

        self.request_count += 1

        if res is None:
            if self.get_fail_cnt(1, 'failcount-none') < 10:
                self.re_add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    tid, proxies)
                #self.__fail_urls.append(url)
                self.re_add_job(jobid)
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (proxies, self.get_fail_cnt(0, 'failcount-none')))
            return
        else:
            setattr(self._curltls, 'failcount-none', 0)  # reset the counter get_fail_cnt reads; 'failcount' was never consulted

        if res.code == 407:
            if self.get_fail_cnt(1, 'failcount-407') < 10:
                print "%s ======》 407  , retry:%d" % (url, retry)
                self.re_add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    tid, proxies)
                self.re_add_job(jobid)
                #self.__fail_urls.append(url)
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (proxies, self.get_fail_cnt(0, 'failcount-407')))
            return
        else:
            setattr(self._curltls, 'failcount-407', 0)

        if res.code == 404:
            print "%s ------> 404, retry:%d" % (url, retry)
            if retry < 3:
                self.re_add_job({"url": url, "retry": (retry + 1)})  # jobid_int was undefined here; this job is keyed by url
            else:
                self.__fail_urls.append(url)
            return
        elif res.code in (500, 502, 503, 504):
            print "%s ------> %d " % (url, res.code)
            self.add_job(jobid)
            time.sleep(random.randrange(1, 3, 1))
            return
        elif res.code == 200:
            print "%s ————> will be into database......." % url
            m = re.search(ur'http://www.jobui.com/job/(\d+)/', url)
            if m:
                jid = m.group(1)
                self.page_store.save(int(time.time()), jid, url, res.text)
                self.success_count += 1
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ] retry:%d" % (
                res.code, retry)
            if retry < 3:
                self.re_add_job({"url": url, "retry": (retry + 1)})  # again keyed by url, not the undefined jobid_int
            else:
                self.__fail_urls.append(url)

            #raise AccountErrors.NoAccountError('fatal error')

        print "request_count:{},success_count:{},request_speed:{}".format(
            self.request_count, self.success_count,
            self.request_count / (time.time() - self.start_time))
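Most run_job variants end with the same request_count/success_count/speed line. A small helper that owns those counters (the class and names are illustrative, not from the original framework):

import time

class SpiderStats(object):
    # Holds the counters printed at the end of the run_job variants above.
    def __init__(self):
        self.start_time = time.time()
        self.request_count = 0
        self.success_count = 0

    def report(self):
        elapsed = time.time() - self.start_time
        speed = self.request_count / elapsed if elapsed else 0.0
        print "request_count:{},success_count:{},request_speed:{}".format(
            self.request_count, self.success_count, speed)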