def extract_content(self):
    # Parse the resume page; require a name plus at least one contact channel.
    doc = self.get_cur_doc()
    hf = spider.util.htmlfind(doc.cur_content, 'id="resumeContentBody"', 0)
    dom = html.fromstring(doc.cur_content)
    contact_info = self.extract_info(dom)
    name = contact_info.get("name", "")
    email = contact_info.get("email", "")
    telephone = contact_info.get("telephone", "")
    if not (name and (email or telephone)):
        self.log.info("fail id: %s, extract contact information fail" % doc.cur_jdid)
        return None
    try:
        detail = hf.get_text()
    except Exception:
        Log.errorbin("invalid cv content %s" % doc.cur_url, doc.cur_content)
        return None
    return utf8str(contact_info) + utf8str(detail)
def extract_content(self):
    # Variant that reads the resume body from the td#divInfo element.
    doc = self.get_cur_doc()
    dom = html.fromstring(doc.cur_content)
    xx = dom.xpath("//td[@id='divInfo']")
    contact_info = CV51DownloadPageStore.extract_info(dom)
    name = contact_info.get("name", "")
    email = contact_info.get("email", "")
    telephone = contact_info.get("telephone", "")
    if not (name and (email or telephone)):
        self.log.info("fail id: %s, extract contact information fail" % doc.cur_jdid)
        return None
    if xx:  # xpath() always returns a list, so a truthiness check suffices
        return utf8str(contact_info) + utf8str(xx[0].text_content())
    Log.errorbin(doc.cur_url, doc.cur_content)
    Log.error("get cv failed", doc.cur_url)
    return None
def jobrunner1(self, job):
    # Warm up the session on the landing page so the search request carries
    # the expected cookies.
    con = self.sesnreq.request_url(
        "http://www.x315.com/",
        headers={"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"})
    # self.sesnreq.request_url("http://s4.cnzz.com/stat.php?id=1256666136&show=pic1")
    # timestamp is epoch milliseconds: integer seconds + leading fractional digits.
    now = str(time.time())
    url = r"http://www.x315.com/quicksearch?qk=%s&t=1&z=&timestamp=%s" % (
        "富士康", now.split(".")[0] + now.split(".")[1][:3])
    header = {
        "Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Connection": "Keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Referer": "http://www.x315.com/",
        "X-Requested-With": "XMLHttpRequest"
    }
    con = self.sesnreq.request_url(url, headers=header)
    print con.text
    if u"查询过于频繁" in con.text:  # site answers "queries too frequent"
        if not self.re_add_job(job):
            Log.error("re-add job failed.==>" + utf8str(job))
        Log.info("Rate limited by the site, sleeping 10 s.")
        time.sleep(10)
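# A note on the `timestamp` parameter above: it is a millisecond epoch value
# built by concatenating the integer seconds with the leading fractional
# digits of time.time(). A minimal sketch of a more direct equivalent
# (millis_timestamp is a hypothetical helper, not part of this module):
import time

def millis_timestamp():
    # Milliseconds since the epoch, as a string, e.g. "1465372800123".
    return str(int(time.time() * 1000))

# In Python 2, str() on a float keeps only 12 significant digits, so the
# split-and-concatenate form can yield fewer than three fractional digits;
# multiplying first avoids that.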
def run_job(self, job):
    time.sleep(5)
    threadident = str(threading.currentThread().ident)
    sq = getattr(self._curltls, "sq", None)
    if sq is None:
        sq = self.init_req()
    Log.info("Running job:" + util.utf8str(job))
    if job["type"] == "u1":
        # u1: search the company list by keyword, then fan out u2 detail jobs.
        Log.info("Searching line %d" % job["lineno"])
        con = sq.request_url(
            r"http://qiye.qianzhan.com/orgcompany/searchList",
            data={
                "oc_name": job["key"],
                "od_orderby": 0,
                "page": 1,
                "pageSize": 10,
                "oc_area": "",
                "sh_searchType": 1
            })
        if con is None or con.text.strip() == "" or con.code != 200:
            Log.error("[u1]Bad connection or empty content returned. JOB=>" + util.utf8str(job))
            self.re_add_job(job)
            return
        try:
            jsonobj = json.loads(con.text.strip())
        except ValueError:
            # Undecodable response: drop the job.
            Log.error("Json decode error. String is %s" % con.text)
            return
        if not jsonobj["succ"]:
            Log.warning(str(jsonobj))
            time.sleep(1)
            Log.error("[u1]Request failed, succ flag is False. JOB=>" + util.utf8str(job))
            # status == '4' means the current proxy is blocked; drop it and retry.
            if jsonobj.get("status") == '4':
                Log.error("Removing current proxy...Used %d times." % sq._proxy_use_times[sq.curproxy])
                sq.remove_curproxy()
            self.re_add_job(job)
        else:
            corplist = jsonobj["lst"]
            if len(corplist) == 0:
                Log.error("Search returned nothing. %d:%s, no data." % (job["lineno"], job["key"]))
                return
            for corp in corplist:
                self.add_job({
                    "type": "u2",
                    "orgCode": corp["oc_orgCode"],
                    "name": corp["oc_name"]
                })
    elif job["type"] == "u2":
        # u2: fetch the anti-scraping JS, evaluate it with nodejs to obtain the
        # mcfCode token, then request the company detail with that token.
        Log.info("Getting detail info about %s" % job["name"])
        timestr = "%f" % time.time()
        con0 = sq.request_url(
            r"http://qiye.qianzhan.com/orgcompany/GetJsVerfyCode?t=0.%s&_=%s"
            % (timestr.split(".")[1], timestr.split(".")[0]))
        if con0 is None or con0.text.strip() == "" or con0.code != 200:
            Log.error("[u2]Bad connection or empty content returned. JOB=>" + util.utf8str(job))
            self.re_add_job(job)
            return
        # Each thread gets its own scratch directory for the generated JS.
        if not os.path.exists(threadident):
            os.mkdir(threadident)
        with open(threadident + "/qycxb.js", "w+b") as f:
            f.write(r'var window = {document : {cookie :"qznewsite.uid='
                    + sq.get_cookie("qznewsite.uid").strip()
                    + '"}}; ' + con0.text + "console.log(window.__qzmcf())")
        os.system("nodejs " + threadident + "/qycxb.js > " + threadident + "/mcfcode.txt")
        mcfcode = open(threadident + "/mcfcode.txt", "rb").read().strip()
        con1 = sq.request_url(
            "http://qiye.qianzhan.com/orgcompany/SearchItemDtl",
            data={"mcfCode": mcfcode, "orgCode": job["orgCode"]})
        if con1 is None or con1.text.strip() == "" or con1.code != 200:
            Log.error("[u2]Bad connection or empty content returned. JOB=>" + util.utf8str(job))
            self.re_add_job(job)
            return
        jsonobj = json.loads(con1.text.strip())
        if not jsonobj["succ"]:
            Log.warning(str(jsonobj))
            time.sleep(1)
            Log.error("[u2]Request failed, succ flag is False. Check the orgCode and mcfCode. JOB=>"
                      + util.utf8str(job))
            if jsonobj.get("status") == '4':
                Log.error("Removing current proxy...Used %d times." % sq._proxy_use_times[sq.curproxy])
                sq.remove_curproxy()
            self.re_add_job(job)
        else:
            # self.binsaver.append(job["name"] + job["orgCode"], con1.text.strip())
            self.filesaver.append(job["name"] + "|" + job["orgCode"] + "|" + con1.text.strip())
            Log.info("%s,%s,saved." % (job["name"], job["orgCode"]))
    return
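# The u2 branch above works around the site's anti-scraping check: the server
# returns obfuscated JS, the crawler evaluates it under nodejs with a stubbed
# `window` carrying the session cookie, and the printed window.__qzmcf() value
# becomes the mcfCode token for SearchItemDtl. A minimal sketch of that
# evaluation step using subprocess instead of os.system (assumes nodejs is on
# PATH; eval_mcfcode is a hypothetical helper, not part of this module):
import subprocess
import tempfile

def eval_mcfcode(js_text, uid_cookie):
    # Stub out the browser environment the obfuscated JS expects, then capture
    # the token it computes via console.log.
    wrapper = ('var window = {document: {cookie: "qznewsite.uid=%s"}}; '
               '%s console.log(window.__qzmcf())') % (uid_cookie, js_text)
    with tempfile.NamedTemporaryFile(suffix=".js", delete=False) as f:
        f.write(wrapper)
        path = f.name
    return subprocess.check_output(["nodejs", path]).strip()

# Unlike the os.system + redirect-to-file approach, check_output raises
# CalledProcessError when nodejs exits non-zero instead of silently producing
# an empty token file.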
def run_job(self, job):
    Log.info("running job:" + utf8str(job))
    if job["type"] == "t1":
        self.jobrunner1(job)
    elif job["type"] == "t2":
        self.jobrunner2(job)
        elif key == u"海南":
            outmap[key] = "4600"
        elif len(cclist[key]) == 1:
            thekey = cclist[key].keys()[0]
            outmap[key] = thekey
        else:
            # Prefer prefecture-level codes (administrative codes ending in '00').
            preflist = []
            for thekey, v in cclist[key].items():
                for code, name in v:
                    if code[-2:] == '00':
                        preflist.append(code)
            if len(preflist) == 0:
                pass
            elif len(preflist) == 1:
                outmap[key] = preflist[0][0:4]
            else:
                assert False, "ambiguous area code: multiple prefecture-level matches for %s" % key
    fout = {}
    for k, v in outmap.items():
        fout[k] = v
    for k, v in fulloutmap.items():
        if len(v) == 1:
            fout[k] = v[0]
    return fout


spider.util.use_utf8()
allmap = get_area_code()
print utf8str(allmap)
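# The branch above resolves ambiguous name -> code entries by preferring the
# prefecture-level code (the one ending in "00") and keeping its 4-digit
# prefix. A standalone sketch of that selection rule (pick_area_code is a
# hypothetical helper; the data shape mirrors cclist[key] above):
def pick_area_code(candidates):
    # candidates: {group_key: [(code, name), ...], ...}
    preflist = [code for v in candidates.values()
                for code, name in v if code[-2:] == "00"]
    if len(preflist) == 1:
        return preflist[0][:4]
    return None  # zero or multiple matches: leave unresolved

print pick_area_code({"k": [("450100", u"南宁市"), ("450102", u"兴宁区")]})  # prints "4501"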
def need_split(self, url, level, isLast):
    # Base implementation: answer False for URLs already recorded in a
    # previous run; any other decision must come from a subclass override.
    tol = util.utf8str(url).strip()
    if tol in self.oldjobs:
        return False
    raise RuntimeError('virtual function called.')
def log_url(self, url):
    tol = util.utf8str(url).strip()
    if tol in self.oldjobs:
        return
    self.fs.append(tol)
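# need_split above only handles the dedup fast path and otherwise raises, so
# concrete crawlers are expected to override it with a real splitting policy.
# A minimal sketch of such an override (SplitterBase, MySplitter, and the
# depth policy are hypothetical, shown only to illustrate the contract):
class SplitterBase(object):
    def __init__(self, oldjobs):
        self.oldjobs = oldjobs

class MySplitter(SplitterBase):
    def need_split(self, url, level, isLast):
        # Keep the base class's dedup check, then apply a concrete policy.
        tol = url.strip()
        if tol in self.oldjobs:
            return False
        # Example policy: keep splitting ranges until the last level.
        return not isLast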