Exemple #1
0
 def __init__(self, a, b, c):
     """Initialise the runner, its output files and its cookie slots.

     The three arguments are forwarded unchanged to
     ``AioRunner.__init__``.
     """
     AioRunner.__init__(self, a, b, c)
     # Wall-clock time at which this runner was constructed.
     self.baset = time.time()
     # Both output files are opened in binary append/read mode.
     append_binary = "a+b"
     self.dstfile = open("corp_name.txt", append_binary)
     self.failfile = open("corp_fail.txt", append_binary)
     # Cookie values start out unset.
     self.__jsl_clearance = self.__jsluid = None
Exemple #2
0
    def prepare_req(self, job, curl, proxies):
        """Prepare the HTTP request for ``job`` on ``curl``.

        ``AioRunner.prepare_req`` is consulted first; when it returns
        anything other than ``None`` that value is returned as-is.
        Otherwise the request is prepared on ``curl`` with the given
        ``proxies`` and ``True`` is returned.
        """
        pr = AioRunner.prepare_req(self, job, curl, proxies)
        if pr is not None:
            return pr

        # NOTE(review): the original statement here was ``url, headers = {}``,
        # which always raised ValueError (an empty dict cannot be unpacked
        # into two names).  The URL is now read from ``job['url']`` by analogy
        # with the other runners in this file -- confirm that this is the
        # intended source of the URL.
        url = job['url']
        headers = {}
        curl.prepare_req(url, headers=headers, proxies=proxies)
        return True
Exemple #3
0
 def on_result(self, curl, resp):
     AioRunner.on_result(self, curl, resp)
     con = resp
     if con is None or con.text.strip() == "":
         spider.runtime.Log.error("Request return nothing! Readd...." +
                                  self.job.__str__())
         self.master.re_add_job(self.job)
         return
     else:
         corp_name_list = re.findall(
             r'<h3 class="site-list-title">(.*?)<small', con.text, re.S)
         if len(corp_name_list) == 0:
             spider.runtime.Log.warning("line " + str(self.job["lineno"]) +
                                        ", key:" + self.job["key"] +
                                        ", no data...")
             self.failfile.write(self.job["line"].strip() + " no data.\n")
             self.failfile.flush()
             return
         else:
             self.save_name(self.job, corp_name_list)
     print resp.request.url, resp.code
Exemple #4
0
 def on_result(self, curl, resp):
     AioRunner.on_result(self, curl, resp)
     con = resp
     if con is None or con.text.strip() == "":
         spider.runtime.Log.error("Request return nothing! Readd...." +
                                  self.job.__str__())
         self.master.re_add_job(self.job)
         return
     elif con.code == 521:
         f = open("login.js", "w+b")
         f.write(
             con.text.replace("<script>",
                              "").replace("</script>", "").replace(
                                  "document.cookie=dc", "console.log(dc)"))
         f.close()
         os.system("nodejs login.js > cookiestr.txt")
         f = open("cookiestr.txt", "r+b")
         self.__jsl_clearance = re.findall(r"__jsl_clearance=(.*?);",
                                           f.read(), re.S)[0]
         if "Set-Cookie:" in con.headers:
             setcookie = re.findall(r"Set-Cookie:(.*?)path", con.headers,
                                    re.S)[0]
             self.__jsluid = re.findall(r"__jsluid=(.*?);", setcookie,
                                        re.S)[0]
         self.master.re_add_job(self.job)
     else:
         corp_name_list = re.findall(
             r'class="search-result-title"><em>(.*?)</a>', con.text, re.S)
         if len(corp_name_list) == 0:
             spider.runtime.Log.warning("line " + str(self.job["lineno"]) +
                                        ", key:" + self.job["key"] +
                                        ", no data...")
             self.failfile.write(self.job["line"].strip() + " no data.\n")
             self.failfile.flush()
             return
         else:
             self.save_name(self.job, corp_name_list)
     print resp.request.url, resp.code
Exemple #5
0
    def prepare_req(self, job, curl, proxies):
        self.dbg('prepare')
        pa = AioRunner.prepare_req(self, job, curl, proxies)
        if pa is not None:
            return pa

        if 'value' in job:
            url = "https://www.linkedin.com/jobs2/view/%d" % job['value']
        else:
            url = job['url']
        print "[%d] prepare %s proxies=" % (self.idx, url), proxies
        headers={}
        if 'ip.cn' in url:
            headers['User-Agent'] = 'curl/7.20.1'
        curl.prepare_req(url, headers=headers, proxies=proxies)
        return True
Exemple #6
0
    def prepare_req(self, job, curl, proxies):
        self.dbg('prepare')
        pa = AioRunner.prepare_req(self, job, curl, proxies)
        if pa is not None:
            return pa

        if 'key' in job:
            key = spider.util.utf8str(job['key'])
            url = "http://qichacha.com/search?key=" + quote(key) + "&sType=0"
        else:
            Log.error("Invalid job.===>" + job.__str__())
        print "[%d] prepare %s proxies=" % (self.idx, url), proxies
        headers = {}
        if 'ip.cn' in url:
            headers['User-Agent'] = 'curl/7.20.1'
        curl.prepare_req(url, headers=headers, proxies=proxies)
        return True
Exemple #7
0
    def prepare_req(self, job, curl, proxies):
        self.dbg('prepare')
        pa = AioRunner.prepare_req(self, job, curl, proxies)
        if pa is not None:
            return pa

        if 'key' in job:
            key = spider.util.utf8str(job['key'])
            url = r"http://www.qixin007.com/search/?key=" + quote(
                key) + "&type=enterprise&source=&isGlobal=Y"
            # url = "http://qichacha.com/search?key=" + quote(key) + "&sType=0"
        else:
            Log.error("Invalid job.===>" + job.__str__())
        print "[%d] prepare %s proxies=" % (self.idx, url), proxies
        headers = {}
        if 'ip.cn' in url:
            headers['User-Agent'] = 'curl/7.20.1'
        if self.__jsl_clearance:
            headers["Cookie"] = "__jsl_clearance=" + self.__jsl_clearance + ";"
        if self.__jsluid:
            headers["Cookie"] += "__jsluid=" + self.__jsluid
        curl.prepare_req(url, headers=headers, proxies=proxies)
        return True
Exemple #8
0
 def on_error(self, curl, errcode, errmsg):
     AioRunner.on_error(self, curl, errcode, errmsg)
     print "[%d] error, proxy_errcnt=%d" % (self.idx, self.proxyerr)
     print "with: code=%d msg=%s" % (errcode, errmsg)
Exemple #9
0
 def __init__(self, a, b, c):
     """Initialise the runner and open its two output files.

     The three arguments are forwarded unchanged to
     ``AioRunner.__init__``.
     """
     AioRunner.__init__(self, a, b, c)
     # Wall-clock time at which this runner was constructed.
     self.baset = time.time()
     # Both output files are opened in binary append/read mode.
     append_binary = "a+b"
     self.dstfile = open("corp_name.txt", append_binary)
     self.failfile = open("corp_fail.txt", append_binary)
Exemple #10
0
 def __init__(self, curl, selproxy, idx):
     """Initialise the runner by delegating to ``AioRunner.__init__``.

     ``curl``, ``selproxy`` and ``idx`` are passed through unchanged; this
     subclass adds no state of its own here.
     """
     AioRunner.__init__(self, curl, selproxy, idx)
Exemple #11
0
 def on_result(self, curl, resp):
     self.dbg('result')
     AioRunner.on_result(self, curl, resp)
     print resp.request.url, resp.code
Exemple #12
0
 def __init__(self, a, b, c):
     """Initialise the runner and record its construction time.

     The three arguments are forwarded unchanged to
     ``AioRunner.__init__``.
     """
     AioRunner.__init__(self, a, b, c)
     started_at = time.time()
     self.baset = started_at