def __init__(self, threadcnt):
    """Initialize spider state for a pool of ``threadcnt`` worker threads."""
    # Networking helper and pool sizing.
    self.networker = BasicRequests()
    self.thread_count = threadcnt
    self._tls = threading.local()
    self._logport = 6666
    self._name = 'spider'
    # Worker / reporter bookkeeping.
    self._threads = []
    self._reporter = None
    self._worker_count = 0
    self._stat = None
    self._end_mark = 0
    self._running_count = 0  # number of threads currently executing a job
    # Synchronization primitives and job-queue state.
    self.locker = threading.RLock()
    self.job_mask = 'mnf'
    # NOTE(review): "condtion" is a typo for "condition"; kept for caller compatibility.
    self.condtion = threading.Condition()
    self._jobq = None
    # Tells wait_for_condition that no new jobs will ever arrive — stop waiting.
    self._no_more_wait_job = False
def __init__(self, threadcnt):
    """Initialize the spider: HTTP base class, job queues, progress state."""
    BasicRequests.__init__(self)
    self.thread_count = threadcnt
    # Job queues: bounded FIFO, a LIFO companion, and one for failed jobs.
    self.job_queue = Queue.Queue(100)
    self.job_queue2 = Queue.LifoQueue()
    self.job_queue3 = Queue.Queue()  # for failed jobs
    self._tls = threading.local()
    self._logport = 5555
    self._end_mark = 0
    # Main-job lock and the time it was last taken.
    self._mjlock = threading.RLock()
    self._mjlocktime = 0
    self._name = 'spider'
    # Thread bookkeeping.
    self._threads = []
    self._reporter = None
    self._dispatcher = None
    self._worker_count = 0
    self.curjobid = 0
    self.enable_mainjob_timedlock = False
    # Start timestamps (epoch seconds; datetime is filled in later).
    self._start_timet = time.time()
    self._start_datetime = None
    self._running_count = 0
def __init__(self):
    """Create the HTTP helper and load the zhilian proxy list into it."""
    requester = BasicRequests()
    requester.load_proxy('../../conf/zhilian_proxy')
    self.req = requester
class PageExpireDetect(object):
    """Checks whether a job posting URL on various Chinese job sites has expired.

    Each ``*_page_detect`` method fetches the page through self.req and
    returns False when the posting is expired/removed, True otherwise.
    """

    def __init__(self):
        # HTTP helper with the zhilian proxy list pre-loaded.
        self.req = BasicRequests()
        self.req.load_proxy('../../conf/zhilian_proxy')

    def load_proxy(self, fn, index=-1, auto_change=True):
        """Forward a proxy-file load to the underlying requester."""
        self.req.load_proxy(fn, index, auto_change)

    def lagou_page_detect(self, url):
        """Return False if the lagou posting at ``url`` was deleted, else True."""
        resp = self.req.request_url(url)
        # Normalize to a UTF-8 byte string before parsing / substring checks.
        if isinstance(resp.text, unicode):
            resp.text = resp.text.encode("utf-8")
        htl = html.fromstring(resp.text)
        # A "position deleted" div marks a removed posting.
        if htl.xpath("/div[@class='position_del']"):
            return False
        # Literal "you're too late, this info was deleted" banner.
        if "亲,你来晚了,该信息已经被删除鸟" in resp.text:
            return False
        return True

    def jd51job_page_detect(self, url):
        """Return False if the 51job posting expired, True otherwise."""
        resp = self.req.request_url(url)
        # 51job pages are GB2312-encoded.
        resp.encoding = 'gb2312'
        # "Sorry, the position you selected has paused recruiting."
        if u"很抱歉,你选择的职位目前已经暂停招聘" in resp.text:
            return False
        return True

    def zhilian_page_detect(self, url):
        """Return False if the zhilian posting is outmoded/removed, else True."""
        resp = self.req.request_url(url)
        # "Outmoded" banner image means the posting expired.
        if "http://img01.zhaopin.cn/2014/seo/images/outmoded_01.png" in resp.text:
            return False
        htl = html.fromstring(resp.text)
        # A "return page" block replaces removed postings.
        if htl.xpath("//div[@class='returnpage']"):
            return False
        # A "company expired" badge image also marks an expired posting.
        srcs = htl.xpath("//div[@class='inner-right fr']/img/@src")
        for src in srcs:
            if "http://img01.zpin.net.cn/2014/rd2/img/company_gq.png" in src:
                return False
        return True

    def liepin_page_detect(self, url):
        """Return False if the liepin posting is missing/expired/closed, else True."""
        con = self.req.request_url(url)
        # Fetch failed, or "page does not exist or was deleted".
        if con is None or u'抱歉, 您访问的页面不存在或已删除' in con.text:
            return False
        # "The position you are viewing has expired."
        if u'抱歉,您查看的职位已过期' in con.text:
            return False
        # "This position has ended."
        if u'该职位已结束' in con.text:
            return False
        return True

    def wealink_page_detect(self, url):
        # NOTE(review): this method appears truncated in the source — it parses
        # the page but never returns a verdict; confirm against the original file.
        resp = self.req.request_url(url)
        htl = html.fromstring(resp.text)
class Spider2(object):
    """Multi-threaded spider (class continues beyond this chunk).

    Holds a nested Stats progress tracker and the thread/queue state
    initialized in __init__.
    """

    class Stats:
        """Progress counters and current-job info for status reporting."""

        def __init__(self):
            # aj (auxiliary job) and mj (main job) counters.
            self.job_count = 0
            self.mjob_count = 0
            self.start_time = time.time()
            self.start_date = datetime.datetime.now()
            # Human-readable description of the job currently running.
            self.curjobstr = ""
            self.curjob = {}

        def report_str(self):
            """Build the multi-line completion report string."""
            timespan = str(datetime.datetime.now() - self.start_date)
            reportstr = "prog:%s\nlast job is %s\nDONE time used:%s\n" % (
                ' '.join(sys.argv), self.curjobstr, timespan)
            reportstr += "mj: %d aj:%d\n" % (self.mjob_count, self.job_count)
            return reportstr

        def msg_str(self, mqsz, qsz):
            """One-line status: pid, current job (truncated), and done/queued counts.

            mqsz / qsz are the current main-job and auxiliary-job queue sizes.
            """
            progargs = (self.mjob_count, mqsz, self.job_count, qsz)
            prog = "mj:%d/%d,aj:%d/%d" % progargs
            # Prefer the job's URL when present; otherwise dump the whole job.
            if self.curjob.has_key('url'):
                cjstr = util.utf8str(self.curjob['url'])
            else:
                cjstr = util.utf8str(self.curjob)
            # Strip newlines and cap at 100 chars to keep the line compact.
            cjstr = re.sub(r'\r|\n', '', cjstr)
            if len(cjstr) > 100:
                cjstr = cjstr[0:100]
            return "[pid=%d]job:%s prog:%s\n" % (os.getpid(), cjstr, prog)

    def __init__(self, threadcnt):
        # Networking helper and pool sizing.
        self.networker = BasicRequests()
        self.thread_count = threadcnt
        self._tls = threading.local()
        self._logport = 6666
        self._name = 'spider'
        # Worker / reporter bookkeeping.
        self._threads = []
        self._reporter = None
        self._worker_count = 0
        self._stat = None
        self._end_mark = 0
        self._running_count = 0  # number of threads currently executing a job
        # Synchronization primitives and job-queue state.
        self.locker = threading.RLock()
        self.job_mask = 'mnf'
        # NOTE(review): "condtion" is a typo for "condition"; kept for compatibility.
        self.condtion = threading.Condition()
        self._jobq = None
        # Tells wait_for_condition that no new jobs will ever arrive — stop waiting.
        self._no_more_wait_job = False
def request_url(self, url, **kwargs): return self.networker.request_url(url, **kwargs) def run(self, async=False): if (len(self._threads) > 0 or self._reporter is not None): raise RuntimeError("already run??") self._threads = [] self._worker_count = 0 self._stat = Spider2.Stats() self._jobq = cutil.JobQueue('%s.job.bin' % self._name) if self._jobq.get_size() == 0: self.init_jobs() self._reporter = threading.Thread(target=self.report) self._reporter.start() runtime.Runtime.set_thread_name(self._reporter.ident, "%s.job.reporter" % self._name) for i in range(0, self.thread_count): t = threading.Thread(target=self._job_runner, args=(i, )) t.start() runtime.Runtime.set_thread_name(t.ident, "%s.worker.%d" % (self._name, i)) self._threads.append(t) self.event_handler('STARTED', '') if not async: self.wait_run(True)