# Example 1
 def __init__(self, threadcnt):
     """Initialise spider state for *threadcnt* workers; no threads start here."""
     # Public configuration and synchronisation primitives.
     self.thread_count = threadcnt
     self.networker = BasicRequests()
     self.job_mask = 'mnf'
     self.locker = threading.RLock()
     self.condtion = threading.Condition()  # sic: misspelled name kept for compatibility
     # Private bookkeeping, filled in once the spider actually runs.
     self._tls = threading.local()
     self._logport = 6666
     self._name = 'spider'
     self._threads = []
     self._reporter = None
     self._worker_count = 0
     self._stat = None
     self._jobq = None
     self._end_mark = 0
     self._running_count = 0  # number of threads currently executing a job
     self._no_more_wait_job = False  # signals wait_for_condition(): no new jobs are coming, stop waiting
# Example 2
 def __init__(self, threadcnt):
     """Initialise the spider: base requester state, job queues and counters."""
     BasicRequests.__init__(self)  # base-class init first: sets up request state
     self.thread_count = threadcnt
     # Three job queues: bounded main queue, LIFO queue, and one for failed jobs.
     self.job_queue = Queue.Queue(100)
     self.job_queue2 = Queue.LifoQueue()
     self.job_queue3 = Queue.Queue()  # failed jobs
     # Thread-local storage and main-job locking.
     self._tls = threading.local()
     self._mjlock = threading.RLock()
     self._mjlocktime = 0
     self.enable_mainjob_timedlock = False
     # Identity / reporting.
     self._logport = 5555
     self._name = 'spider'
     self._start_timet = time.time()
     self._start_datetime = None
     # Runtime bookkeeping.
     self._end_mark = 0
     self._threads = []
     self._reporter = None
     self._dispatcher = None
     self._worker_count = 0
     self.curjobid = 0
     self._running_count = 0
# Example 3
 def __init__(self):
     """Create a requester preloaded with the zhilian proxy list."""
     self.req = BasicRequests()
     # Proxy-list path is relative to the process working directory — TODO confirm
     # callers always run from the expected directory.
     self.req.load_proxy('../../conf/zhilian_proxy')
# Example 4
class PageExpireDetect(object):

    def __init__(self):
        """Create the underlying requester, preloaded with the zhilian proxy list."""
        self.req = BasicRequests()
        # Proxy-list path is relative to the process working directory — TODO confirm.
        self.req.load_proxy('../../conf/zhilian_proxy')

    def load_proxy(self, fn, index=-1, auto_change=True):
        """Delegate proxy-list loading to the underlying BasicRequests instance."""
        self.req.load_proxy(fn, index, auto_change)

    def lagou_page_detect(self, url):
        """Return False if the lagou job page at *url* is deleted, True otherwise."""
        resp = self.req.request_url(url)
        # lxml expects bytes/str here, so encode unicode responses (Python 2 path).
        if isinstance(resp.text, unicode):
            resp.text = resp.text.encode("utf-8")
        htl = html.fromstring(resp.text)

        # BUG FIX: the original used the absolute path "/div[...]", which can only
        # match a document whose ROOT element is <div> and therefore never fired
        # on a real HTML page. "//div[...]" searches the whole document, matching
        # the sibling detectors (e.g. zhilian_page_detect).
        if htl.xpath("//div[@class='position_del']"):
            return False

        # Literal "position deleted" notice text on the page.
        if "亲,你来晚了,该信息已经被删除鸟" in resp.text:
            return False

        return True

    def jd51job_page_detect(self, url):
        """Return True while the 51job posting is live, False once it has expired."""
        resp = self.req.request_url(url)
        resp.encoding = 'gb2312'  # site serves GBK-family content
        # Expired pages carry this apology banner.
        return u"很抱歉,你选择的职位目前已经暂停招聘" not in resp.text

    def zhilian_page_detect(self, url):
        """Return False if the zhilian posting at *url* is outdated, True otherwise."""
        resp = self.req.request_url(url)

        # Fast path: the "outmoded" banner image appears verbatim in the markup.
        if "http://img01.zhaopin.cn/2014/seo/images/outmoded_01.png" in resp.text:
            return False

        doc = html.fromstring(resp.text)

        # A "return to listing" block is only rendered on dead postings.
        if doc.xpath("//div[@class='returnpage']"):
            return False

        # An expired-company badge may appear among the right-hand images.
        badge = "http://img01.zpin.net.cn/2014/rd2/img/company_gq.png"
        img_srcs = doc.xpath("//div[@class='inner-right fr']/img/@src")
        if any(badge in src for src in img_srcs):
            return False

        return True

    def liepin_page_detect(self, url):
        """Return False if the liepin posting is missing/expired/closed, True otherwise."""
        resp = self.req.request_url(url)
        if resp is None:
            return False
        # Any of these notices marks a dead posting.
        dead_markers = (
            u'抱歉, 您访问的页面不存在或已删除',
            u'抱歉,您查看的职位已过期',
            u'该职位已结束',
        )
        return not any(marker in resp.text for marker in dead_markers)

    def wealink_page_detect(self, url):
        resp = self.req.request_url(url)
        htl = html.fromstring(resp.text)
# Example 5
class Spider2(object):
    class Stats:
        def __init__(self):
            """Reset all progress counters and record the start time."""
            self.job_count = 0   # auxiliary-job counter ("aj" in reports)
            self.mjob_count = 0  # main-job counter ("mj" in reports)
            self.start_time = time.time()
            self.start_date = datetime.datetime.now()
            self.curjobstr = ""  # printable description of the job in progress
            self.curjob = {}     # the job dict currently being processed

        def report_str(self):
            timespan = str(datetime.datetime.now() - self.start_date)
            reportstr = "prog:%s\nlast job is %s\nDONE time used:%s\n" % (
                ' '.join(sys.argv), self.curjobstr, timespan)
            reportstr += "mj: %d aj:%d\n" % (self.mjob_count, self.job_count)
            return reportstr

        def msg_str(self, mqsz, qsz):
            """Return a one-line status message.

            *mqsz* and *qsz* are the current main-job and auxiliary-job queue
            sizes; the line shows the pid, a truncated description of the
            current job and the done/queued progress counters.
            """
            progargs = (self.mjob_count, mqsz, self.job_count, qsz)
            prog = "mj:%d/%d,aj:%d/%d" % progargs
            # FIX: dict.has_key() was removed in Python 3; the `in` operator is
            # the exact equivalent and works identically on Python 2.
            if 'url' in self.curjob:
                cjstr = util.utf8str(self.curjob['url'])
            else:
                cjstr = util.utf8str(self.curjob)
            cjstr = re.sub(r'\r|\n', '', cjstr)  # keep the status on one line
            if len(cjstr) > 100:
                cjstr = cjstr[0:100]  # truncate very long job descriptions
            return "[pid=%d]job:%s prog:%s\n" % (os.getpid(), cjstr, prog)

    def __init__(self, threadcnt):
        """Set up spider state for *threadcnt* worker threads; nothing starts until run()."""
        self.networker = BasicRequests()  # shared HTTP client used by request_url()
        self.thread_count = threadcnt
        self._tls = threading.local()
        self._logport = 6666
        self._name = 'spider'
        self._threads = []  # worker Thread objects, populated by run()
        self._reporter = None  # reporter Thread, created in run()
        self._worker_count = 0
        self._stat = None  # Spider2.Stats instance, created in run()
        self._end_mark = 0
        self._running_count = 0  # number of threads currently executing a job
        self.locker = threading.RLock()
        self.job_mask = 'mnf'
        # NOTE(review): attribute name is misspelled ('condtion'); it may be
        # referenced elsewhere, so it is left unchanged here.
        self.condtion = threading.Condition()
        self._jobq = None  # cutil.JobQueue, created in run()
        self._no_more_wait_job = False  # tells wait_for_condition(): no new jobs will arrive, stop waiting

    def request_url(self, url, **kwargs):
        """Fetch *url* via the shared BasicRequests networker (thin delegate)."""
        return self.networker.request_url(url, **kwargs)

    def run(self, async=False):
        # NOTE(review): `async` became a reserved keyword in Python 3.7, so this
        # signature only parses on Python 2 / <=3.6. Renaming it would change the
        # public interface, so it is documented here rather than changed.
        """Start the reporter thread and the worker pool.

        Raises RuntimeError if the spider was already started. When *async* is
        false, blocks in wait_run() until the run completes.
        """
        if (len(self._threads) > 0 or self._reporter is not None):
            raise RuntimeError("already run??")

        self._threads = []
        self._worker_count = 0
        self._stat = Spider2.Stats()

        # Persistent job queue backed by a file; only seed fresh jobs when it is
        # empty (a non-empty file presumably means resuming a previous run).
        self._jobq = cutil.JobQueue('%s.job.bin' % self._name)
        if self._jobq.get_size() == 0:
            self.init_jobs()
        self._reporter = threading.Thread(target=self.report)
        self._reporter.start()
        runtime.Runtime.set_thread_name(self._reporter.ident,
                                        "%s.job.reporter" % self._name)

        # One worker thread per configured slot, each named for debugging.
        for i in range(0, self.thread_count):
            t = threading.Thread(target=self._job_runner, args=(i, ))
            t.start()
            runtime.Runtime.set_thread_name(t.ident,
                                            "%s.worker.%d" % (self._name, i))
            self._threads.append(t)

        self.event_handler('STARTED', '')
        if not async:
            self.wait_run(True)