Example #1
0
    def __init__(self, channel):
        """Set up the expire-detection task for ``channel``.

        The task registers itself with ``BaseTask`` under the name
        ``"<channel>_expire_detect"`` and creates its collaborators: a
        ``PageStore`` bound to the channel and a ``PageExpireDetect``
        instance.
        """
        self.channel = channel

        # Task name passed to the base class is derived from the channel.
        task_name = "{}_expire_detect".format(channel)
        BaseTask.__init__(self, task_name)

        # Collaborators are created in the same order as before, in case
        # their constructors have side effects.
        self.page_store = PageStore(channel)
        self.page_expire_detect = PageExpireDetect()

        # Work list starts empty; test mode is off by default.
        self.process_items = []
        self.test_mode = False
Example #2
0
class JdExpireDetect(BaseTask):
    """Task that re-checks stored job pages of one channel for expiry.

    Candidate records are collected from ``PageStore`` by ``_load_data``,
    pushed onto a queue by ``dispatcher`` and checked one by one in
    ``run_job``; records whose page check comes back falsy are marked as
    expired in the store (unless ``test_mode`` is set).
    """

    # Age threshold, in milliseconds, used by ``pre_check`` (3 days).
    _STALE_AFTER_MS = 3 * 24 * 3600 * 1000

    # A progress line is printed every time this many items have been loaded.
    _PROGRESS_STEP = 10000

    # Channel name -> name of the ``PageExpireDetect`` method that checks it.
    _CHANNEL_DETECTORS = {
        "jd_lagou": "lagou_page_detect",
        "jd_51job": "jd51job_page_detect",
        "jd_zhilian": "zhilian_page_detect",
        "jd_wealink": "wealink_page_detect",
        "jd_liepin": "liepin_page_detect",
    }

    def __init__(self, channel):
        """Create the task for ``channel`` together with its store and detector."""
        self.channel = channel
        BaseTask.__init__(self, "{}_expire_detect".format(channel))
        self.page_store = PageStore(channel)
        self.process_items = []
        self.page_expire_detect = PageExpireDetect()

        # When True, ``run_job`` only reports expired pages and does not
        # write the expired flag back to the store.
        self.test_mode = False

    def pre_check(self, item):
        """Queue ``item`` for checking if it is stale and not yet marked expired.

        ``item`` is a stored record; it is queued (as a dict holding only
        ``indexUrl`` and ``realUrl``) when its ``updateTime`` is older than
        ``_STALE_AFTER_MS`` and ``isExpired`` is absent or ``0``.
        """
        # ``updateTime`` is presumably epoch milliseconds, since it is
        # compared against a millisecond timestamp -- verify against the store.
        now = int(time.time() * 1000)

        # The window must be in milliseconds as well; the previous code
        # subtracted a number of *seconds* from a millisecond timestamp,
        # which shrank the 3-day window to roughly 4 minutes.
        # NOTE(review): the original comment read "only check records within
        # 3 days that are not marked expired", yet the comparison selects
        # records *older* than the window. The direction is kept unchanged --
        # confirm the intended behavior.
        if "updateTime" in item and item["updateTime"] < now - self._STALE_AFTER_MS:
            if "isExpired" not in item or item["isExpired"] == 0:
                self.process_items.append(
                    {"indexUrl": item["indexUrl"], "realUrl": item["realUrl"]})

    def _load_data(self):
        """Run ``pre_check`` over every stored record, printing progress."""
        reported = 0
        for item in self.page_store.get_all():
            self.pre_check(item)
            loaded = len(self.process_items)
            # Report each multiple of the step only once; without the
            # ``loaded != reported`` guard the same line would be printed for
            # every skipped record while the count sits on a multiple.
            if loaded != reported and loaded % self._PROGRESS_STEP == 0:
                reported = loaded
                print("load {} items".format(loaded))

        print("totally load {} items".format(len(self.process_items)))

    def dispatcher(self, q):
        """Put every queued item on ``q``, then a ``None`` marker, then wait."""
        for item in self.process_items:
            q.put(item)

        # ``None`` is put after the last item; ``run_job`` ignores anything
        # that is not a dict.
        q.put(None)
        self.wait_q()

    def check_expire_by_channel(self, job, channel):
        """Run the channel-specific page check on ``job["realUrl"]``.

        Returns whatever the detector returns; ``run_job`` treats a falsy
        result as "page expired".

        Raises:
            ValueError: if ``channel`` has no registered detector. Previously
                this case fell through and returned ``None``, which
                ``run_job`` interpreted as "expired" and so flagged every
                page of an unsupported channel.
            KeyError: if ``job`` has no ``realUrl`` entry.
        """
        detector_name = self._CHANNEL_DETECTORS.get(channel)
        if detector_name is None:
            raise ValueError("unsupported channel: {}".format(channel))
        detector = getattr(self.page_expire_detect, detector_name)
        return detector(job["realUrl"])

    def run_job(self, job):
        """Check one job and mark it expired in the store when the check fails.

        Non-dict jobs (such as the ``None`` marker put by ``dispatcher``) are
        ignored. Any exception is reported on stdout and swallowed so that a
        single bad record does not stop the task.
        """
        if not isinstance(job, dict):
            return

        # Looked up with ``.get`` so that the failure message below can be
        # printed even when the key is missing (indexing again inside the
        # handler would raise a second, uncaught KeyError).
        index_url = job.get("indexUrl")
        try:
            if self.check_expire_by_channel(job, self.channel):
                return
            if not self.test_mode:
                self.page_store.set_expire({"indexUrl": job["indexUrl"]})
            print("set expired , indexUrl: {}".format(job["indexUrl"]))
        except Exception as e:
            print(e)
            print("failed , indexUrl: {}".format(index_url))

    def event_handler(self, evt, msg, **kwargs):
        """Send a notification e-mail carrying ``msg`` on START and DONE events.

        Other events are ignored. The subject is the channel name followed by
        a fixed Chinese label ("expiry detection").
        """
        if evt in ("START", "DONE"):
            util.send_email(["<*****@*****.**>"], "{} 过期检测".format(self.channel), msg)