def get_count(jd_or_cv, channel):
    """Return ``count_all()`` of the store that matches ``jd_or_cv``.

    :param jd_or_cv: store selector; one of ``"jd"``, ``"cv"`` or ``"co"``.
    :param channel: passed unchanged to the selected store's constructor.
    :returns: whatever the selected store's ``count_all()`` returns.
    :raises Exception: if ``jd_or_cv`` is not one of the selectors above.
    """
    # Each store class is only looked up on its own branch, so an unknown
    # selector fails with the explicit error below and nothing else.
    if jd_or_cv == "jd":
        return PageStore(channel).count_all()
    if jd_or_cv == "cv":
        return CVPageStore(channel).count_all()
    if jd_or_cv == "co":
        return CoPageStore(channel).count_all()

    raise Exception("unknown jd_or_cv type: %s" % jd_or_cv)
Beispiel #2
0
    def __init__(self, channel):
        """Initialise the task for one channel.

        :param channel: channel name; used for the task name and to open
            the channel's ``PageStore``.
        """
        self.channel = channel

        # The base class is initialised with the task name
        # "<channel>_expire_detect".
        task_name = "{}_expire_detect".format(channel)
        BaseTask.__init__(self, task_name)

        # Collaborators are created in the same order as before.
        self.page_store = PageStore(channel)
        self.process_items = []
        self.page_expire_detect = PageExpireDetect()

        # Plain flag, off by default.
        self.test_mode = False
Beispiel #3
0
    def __init__(self, owner, queue_size, thread_cnt):
        """Initialise stores, the thrift client and the output files.

        :param owner: channel name; selects the raw / measure stores and the
            ``page_store_<owner>`` collection name.
        :param queue_size: forwarded unchanged to ``BaseTask.__init__``.
        :param thread_cnt: forwarded unchanged to ``BaseTask.__init__``.
        """
        BaseTask.__init__(self, owner, queue_size, thread_cnt)

        # Plain in-memory state.
        self.owner = owner
        self.process_item = {}
        self.expired_items = []

        # Channel-specific stores, created in the original order.
        raw_store = self._get_raw_store(owner)
        measure_store = self._get_measure_store(owner)
        self._raw_store = raw_store
        self._measure_store = measure_store
        self._page_store = PageStore(owner)

        # Location of the page-store collection.
        self.page_store_db = 'admin'
        self.page_store_coll = "page_store_{}".format(owner)

        # The measure client is an attribute of the thrift client.
        client = TClient('../../conf/thrift_conf.ini')
        self.thrift_client = client
        self.jd_measure_client = client.jd_measure_server_client

        self.test_mode = False

        # Output files; paths come from BaseTask.PathConfig.
        path_config = BaseTask.PathConfig
        self.rs_file = FileSave(path_config.result_file)
        self.failfile = FileSave(path_config.etl_failids_file)
Beispiel #4
0
class ETLTask(BaseTask):
    """Re-parse and re-measure stored job-description (JD) pages of one channel.

    ``_load_data`` reads every record of the ``page_store_<owner>`` collection
    and keeps, per ``indexUrl``, the record with the newest ``updateTime`` in
    ``self.process_item``.  ``parse_measure_jd`` and ``update_jd`` do the
    per-item work: parse the page, call the thrift measure service and write
    the raw / measure / page stores.  How the queue filled by ``dispatcher``
    is consumed is decided outside this class.

    ``print`` is always called with one parenthesised argument so that it
    produces the same output as a Python 2 print statement and also parses
    on Python 3.
    """

    # jobWorkAge / jobCate / jobType values that classify a JD as a student job.
    _STUDENT_WORK_AGES = (u"在读学生", u"应届毕业生", u"无经验", u"无要求")
    _STUDENT_CATES = (u"应届毕业生", u"储备干部", u"培训生", u"兼职", u"临时", u"实习生")
    _STUDENT_TYPES = (u"兼职", u"实习")

    def __init__(self, owner, queue_size, thread_cnt):
        """Initialise stores, the thrift measure client and the output files.

        :param owner: channel name; ``"jd_lagou"``, ``"jd_51job"`` and
            ``"jd_zhilian"`` are the values this class recognises.
        :param queue_size: forwarded unchanged to ``BaseTask.__init__``.
        :param thread_cnt: forwarded unchanged to ``BaseTask.__init__``.
        :raises Exception: if ``owner`` is not a recognised channel.
        """
        BaseTask.__init__(self, owner, queue_size, thread_cnt)
        self.owner = owner
        # indexUrl -> {'updateTime', 'contentSign', 'realUrl', 'filePath', 'flag'}
        self.process_item = {}
        # NOTE(review): nothing in this class appends to expired_items.
        self.expired_items = []
        self._raw_store = self._get_raw_store(owner)
        self._measure_store = self._get_measure_store(owner)
        self._page_store = PageStore(owner)
        self.page_store_db = 'admin'
        self.page_store_coll = "page_store_{}".format(owner)

        self.thrift_client = TClient('../../conf/thrift_conf.ini')
        self.jd_measure_client = self.thrift_client.jd_measure_server_client

        # When True, parse_measure_jd / update_jd print instead of saving.
        self.test_mode = False

        self.rs_file = FileSave(BaseTask.PathConfig.result_file)
        self.failfile = FileSave(BaseTask.PathConfig.etl_failids_file)

    def _get_etl_store(self, owner, kind):
        """Create the channel-specific ETL store of the given kind.

        :param owner: channel name.
        :param kind: store kind passed to the store constructor
            (``'raw'`` or ``'measure'`` in this class).
        :raises Exception: if ``owner`` is not a recognised channel.
        """
        if "jd_lagou" == owner:
            return LgEtlStore(kind)
        if "jd_51job" == owner:
            return Job51EtlStore(kind)
        if "jd_zhilian" == owner:
            return ZLEtlStore(kind)

        raise Exception(" unknown owner ")

    def _get_raw_store(self, owner):
        """Return the ``'raw'`` ETL store for ``owner`` (raises on unknown owner)."""
        return self._get_etl_store(owner, 'raw')

    def _get_measure_store(self, owner):
        """Return the ``'measure'`` ETL store for ``owner`` (raises on unknown owner)."""
        return self._get_etl_store(owner, 'measure')

    def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime, filePath, flag):
        """Remember a page record, keeping only the newest one per ``indexUrl``.

        A new ``indexUrl`` is always stored.  An existing entry is overwritten
        only when its stored ``updateTime`` is strictly older than the new one.
        """
        record = {
            'updateTime': updateTime,
            'contentSign': contentSign,
            'realUrl': realUrl,
            'filePath': filePath,
            'flag': flag,
        }
        existing = self.process_item.get(indexUrl)
        if existing is None:
            self.process_item[indexUrl] = record
        elif existing['updateTime'] < updateTime:
            # Update in place so the stored dict object stays the same.
            existing.update(record)

    def check(self, item):
        """Classify one page-store record and register it for processing.

        Flags passed to ``fill_data_with_flag``: ``0`` when ``status`` is 0,
        ``1`` when ``status`` is 1 and ``isUpdated`` is 1, ``2`` when
        ``isExpired`` is 1.  Missing ``status`` / ``isUpdated`` / ``isExpired``
        fields default to 0.
        """
        status = item.get('status', 0)
        isUpdated = item.get('isUpdated', 0)
        updateTime = item.get('updateTime')
        indexUrl = item.get('indexUrl')
        contentSign = item.get('contentSign')
        file_path = item.get('pageContentPath')
        jdUrl = item.get('realUrl')
        expired = item.get('isExpired', 0)

        # NOTE(review): fill_data_with_flag only overwrites on a strictly newer
        # updateTime, so when one record matches several branches below only
        # the first matching call can take effect.
        if status == 0:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 0)
        if status == 1 and isUpdated == 1:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 1)
        if expired == 1:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 2)

    def _load_data(self):
        """Scan the whole page-store collection and fill ``self.process_item``."""
        page_client = self._raw_store.cmgClient.page_store_mongo_client
        collection = page_client[self.page_store_db][self.page_store_coll]
        for item in collection.find():
            self.check(item)
            loaded = len(self.process_item)
            # Progress output every time the item count sits on a multiple of 10000.
            if loaded and loaded % 10000 == 0:
                print("load {} items".format(loaded))

        print("finish load {} items".format(len(self.process_item)))
        print("finish load {} expired items".format(len(self.expired_items)))

    def dispatcher(self, q):
        """Put every ``process_item`` key and every expired item on ``q``.

        A final ``None`` is put after all items (presumably an end-of-stream
        marker for the consumer), then ``self.wait_q()`` is called.
        """
        # Iterating the dict yields its keys, i.e. the indexUrl values.
        for item in self.process_item:
            q.put(item)

        for item in self.expired_items:
            q.put(item)

        q.put(None)
        self.wait_q()

    def getPageContent(self, filename):
        """Read the stored page content referenced by ``filename``.

        Two path formats are understood:

        * ``"<x>::<bin file>::<offset>"`` - the content is read with
          ``BinReader(<bin file>).readone_at(int(<offset>))``;
        * a plain path without ``"::"`` - the file is read as a whole.

        :raises Exception: if the content is empty, or if ``filename`` is in
            neither of the two formats (previously that case silently
            returned ``None``).
        """
        parts = filename.split("::")
        if len(parts) == 3:
            _, content = BinReader(parts[1]).readone_at(int(parts[2]))
        elif len(parts) == 1:
            with open(filename) as f:
                content = f.read()
        else:
            raise Exception("file name:{} , unknown path format".format(filename))

        if len(content) == 0:
            raise Exception("file name:{} , content error".format(filename))
        return content

    def update_jd(self, item):
        """Refresh the publish date of a stored JD from its page record.

        Writes ``pubDate`` to the raw store, ``pubDateStamp`` to the measure
        store and resets ``isUpdated`` to 0 in the page store.  In test mode
        the raw-store update is printed and nothing is saved.

        :param item: ``indexUrl`` key into ``self.process_item``; also used
            as the ``jdId`` of the raw / measure documents.
        :raises Exception: when one of the three ``save_one`` calls returns a
            falsy result.
        """
        record = self.process_item[item]
        jd_store_key = {'jdId': item}
        page_store_key = {'indexUrl': item, 'contentSign': record.get('contentSign')}
        updateTime = record.get("updateTime")
        # updateTime is divided by 1000 before time.localtime, i.e. it is
        # treated as an epoch timestamp in milliseconds.
        strTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(updateTime/1000))

        jkey = json.dumps(jd_store_key)
        jdoc = json.dumps({"$set": {"pubDate": strTime}})
        jdoc_m = json.dumps({"$set": {"pubDateStamp": updateTime}})

        if self.test_mode:
            print(" Update ===> jdkey: {} \n jdoc: {} \n".format(jkey, jdoc))
            return

        # NOTE(review): the meaning of save_one's third argument is defined by
        # the store classes, which are not visible here.
        if not self._raw_store.save_one(jkey, jdoc, False):
            raise Exception("update jd_raw pubTime Exception")
        if not self._measure_store.save_one(jkey, jdoc_m, False):
            raise Exception("update jd_measure pubTime Exception")
        if not self._page_store.save_one(json.dumps(page_store_key), json.dumps({"$set": {"isUpdated": 0}}), False):
            raise Exception("set page store isUpdated status Exception")

    def check_is_student_job(self, jd_raw):
        """Report whether ``jd_raw`` is a student job and log it if so.

        A JD counts as a student job when its ``jobWorkAge``, ``jobCate`` or
        ``jobType`` is one of the listed values.  For such a JD a line
        ``"<jdId>\\t<expired>"`` is appended to the result file, where
        ``expired`` is 1 if the item's flag in ``process_item`` is 2, else 0.

        :returns: ``True`` for a student job, ``False`` otherwise.
        """
        is_student_job = (
            jd_raw.jobWorkAge in self._STUDENT_WORK_AGES
            or jd_raw.jobCate in self._STUDENT_CATES
            or jd_raw.jobType in self._STUDENT_TYPES
        )
        if not is_student_job:
            return False

        # Two spaces after the arrow reproduce the output of the former
        # two-argument print statement.
        print("student job ===>  %s" % jd_raw.jdId)

        expired = 1 if self.process_item[jd_raw.jdId]['flag'] == 2 else 0
        self.rs_file.append_end_with('\t'.join((jd_raw.jdId, str(expired))))
        return True

    def parse_measure_jd(self, item):
        """Parse, measure and store the page behind ``item``.

        In test mode the raw and measure JSON are printed and nothing is
        saved.  Otherwise the raw and measure documents are saved, the page
        store record gets ``status`` 1 and the JD is checked for being a
        student job.

        :param item: ``indexUrl`` key into ``self.process_item``.
        :raises Exception: any failure is re-raised after ``item`` has been
            appended to the fail-id file.
        """
        try:
            record = self.process_item[item]
            jd_store_key = json.dumps({'jdId': item})
            page_store_key = json.dumps({'indexUrl': item, 'contentSign': record.get('contentSign')})
            fileName = record.get("filePath")
            pageContent = self.getPageContent(fileName)

            jd_raw = self.parse_by_owner(item, pageContent, self.owner)
            jd_measure = self.measure(jd_raw)

            if self.test_mode:
                print("Raw ===> {}".format(jd_raw.to_json()))
                print("Measure ===> {}".format(jd_measure.to_json()))
                return

            if not self._raw_store.save_one(jd_store_key, jd_raw.to_json(), True):
                raise Exception("set raw exception")

            if not self._measure_store.save_one(jd_store_key, jd_measure.to_json(), True):
                raise Exception("set measure exception")

            if not self._page_store.save_one(page_store_key, json.dumps({"$set": {"status": 1}}), False):
                raise Exception("set page store status exception")
            self.check_is_student_job(jd_raw)
        except Exception as e:
            # Remember the failing id before propagating the error.
            self.failfile.append_end_with(item)
            raise e

    def parse_by_owner(self, jdId, pageContent, owner):
        """Extract the raw JD fields from ``pageContent`` with the channel parser.

        ``jdId``, ``jdFrom`` and ``jdUrl`` are filled in from this task's
        data.  When the parser found no ``pubDate``, the page record's
        ``updateTime`` (epoch milliseconds) is formatted and used instead.

        :raises Exception: if ``owner`` is not a recognised channel.
        """
        if "jd_lagou" == owner:
            jdRaw = JdLagouHtmlFind(pageContent).find_fields()
        elif "jd_51job" == owner:
            jdRaw = Jd51JobHtmlFind(pageContent).find_fields()
        elif "jd_zhilian" == owner:
            jdRaw = JdZhilianHtmlFind(pageContent).find_fields()
        else:
            raise Exception("unknown owner")

        jdRaw.jdId = jdId
        jdRaw.jdFrom = self.owner
        jdRaw.jdUrl = self.process_item[jdId].get('realUrl')

        if not jdRaw.pubDate:
            stamp = self.process_item[jdId].get("updateTime")
            # %M is minutes; the previous "%H:%m:%S" put the month number in
            # the minutes position.
            str_uptime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stamp/1000))
            jdRaw.pubDate = str_uptime

        return jdRaw

    def measure(self, jd_raw):
        """Send ``jd_raw`` to the measure service and return the measure object.

        The service result is converted with ``convertToJdMeasure`` and then
        completed with ``jdMd5`` and ``jdSimHash``.
        """
        raw_for_measure = construct_jd_raw(jd_raw)
        jd_measure = self.jd_measure_client.measureJd(raw_for_measure)
        jdMeasureObj = convertToJdMeasure(self.owner, jd_measure)

        self.set_md5_SimHash(jdMeasureObj, jd_raw)

        return jdMeasureObj

    def set_md5_SimHash(self, jd_measure, jd_raw):
        """Set ``jdMd5`` and ``jdSimHash`` on ``jd_measure`` (modified in place)."""
        jd_measure.jdMd5 = self.get_jd_md5(jd_measure, jd_raw)
        jd_measure.jdSimHash = gen_sim_hash(jd_raw.jobDescription)

    def get_jd_md5(self, jd_measure, jd_raw):
        """Return ``get_jd_measure_hash`` over position, company id and location id."""
        fields = {
            "jdPosition": jd_raw.jobPosition,
            "incName": jd_measure.incSegmentId,
            "jdWorkLoc": jd_measure.jobWorkLocId,
        }
        return get_jd_measure_hash(fields)

    def event_handler(self, evt, msg, **kwargs):
        """Send ``msg`` by e-mail on the ``"START"`` and ``"DONE"`` events.

        Any other event is ignored.  ``kwargs`` is accepted but not used.
        """
        if evt in ("START", "DONE"):
            util.send_email(["<*****@*****.**>"], "{}_etl 任务".format(self.owner), msg)
Beispiel #5
0
class JdExpireDetect(BaseTask):
    """Check stored job pages of one channel and mark unavailable ones expired.

    ``_load_data`` selects candidates from the page store, ``dispatcher``
    queues them and ``run_job`` asks the channel-specific detector whether a
    page is still alive; a falsy detector result marks the page as expired.

    ``print`` is always called with one parenthesised argument so that it
    produces the same output as a Python 2 print statement and also parses
    on Python 3.
    """

    # Age threshold used by pre_check: three days, in milliseconds, because
    # it is compared against millisecond timestamps.
    _AGE_THRESHOLD_MS = 3 * 24 * 3600 * 1000

    def __init__(self, channel):
        """Initialise the task for ``channel``.

        :param channel: channel name; used for the task name, the page store
            and to pick the detector in ``check_expire_by_channel``.
        """
        self.channel = channel
        BaseTask.__init__(self, "{}_expire_detect".format(channel))
        self.page_store = PageStore(channel)
        # List of {"indexUrl": ..., "realUrl": ...} dicts to be checked.
        self.process_items = []
        self.page_expire_detect = PageExpireDetect()

        # When True, run_job only prints and does not update the page store.
        self.test_mode = False

    def pre_check(self, item):
        """Queue ``item`` for detection if it is old enough and not yet expired.

        An item is selected when it has an ``updateTime`` more than three
        days before now and its ``isExpired`` field is missing or 0.
        """
        now = int(time.time() * 1000)
        # ``now`` is in milliseconds, so the three-day offset must be in
        # milliseconds as well (it used to be subtracted in seconds).
        # NOTE(review): the original comment said "only check items within 3
        # days that are not marked expired", while the comparison selects
        # items OLDER than the threshold - confirm the intended direction.
        if "updateTime" in item and item["updateTime"] < now - self._AGE_THRESHOLD_MS:
            if "isExpired" not in item or item['isExpired'] == 0:
                self.process_items.append({"indexUrl": item["indexUrl"], "realUrl": item["realUrl"]})

    def _load_data(self):
        """Run ``pre_check`` over every page-store record."""
        for item in self.page_store.get_all():
            self.pre_check(item)
            selected = len(self.process_items)
            # Progress output every time the candidate count sits on a
            # multiple of 10000.
            if selected and selected % 10000 == 0:
                print("load {} items".format(selected))

        print("totally load {} items".format(len(self.process_items)))

    def dispatcher(self, q):
        """Put every candidate on ``q``, then ``None``, then wait for the queue."""
        for item in self.process_items:
            q.put(item)

        # run_job ignores anything that is not a dict, including this None.
        q.put(None)
        self.wait_q()

    def check_expire_by_channel(self, job, channel):
        """Run the detector that belongs to ``channel`` on ``job["realUrl"]``.

        :returns: the detector's result; ``run_job`` treats a falsy result as
            "page expired".
        :raises Exception: if ``channel`` has no detector.  Previously this
            case returned ``None``, which ``run_job`` interpreted as
            "expired" for every page of the channel.
        """
        detector = self.page_expire_detect
        if "jd_lagou" == channel:
            return detector.lagou_page_detect(job["realUrl"])
        if "jd_51job" == channel:
            return detector.jd51job_page_detect(job["realUrl"])
        if "jd_zhilian" == channel:
            return detector.zhilian_page_detect(job["realUrl"])
        if "jd_wealink" == channel:
            return detector.wealink_page_detect(job["realUrl"])
        if 'jd_liepin' == channel:
            return detector.liepin_page_detect(job['realUrl'])

        raise Exception("unknown channel: {}".format(channel))

    def run_job(self, job):
        """Check one queued job and mark its page expired when detection fails.

        Non-dict jobs are ignored.  Any exception is printed together with
        the job's ``indexUrl`` and not propagated.
        """
        if not isinstance(job, dict):
            return
        try:
            if not self.check_expire_by_channel(job, self.channel):
                if not self.test_mode:
                    self.page_store.set_expire({"indexUrl": job["indexUrl"]})
                print("set expired , indexUrl: {}".format(job["indexUrl"]))
        except Exception as e:
            print(e)
            print("failed , indexUrl: {}".format(job["indexUrl"]))

    def event_handler(self, evt, msg, **kwargs):
        """Send ``msg`` by e-mail on the ``"START"`` and ``"DONE"`` events.

        Any other event is ignored.  ``kwargs`` is accepted but not used.
        """
        if evt in ("START", "DONE"):
            util.send_email(["<*****@*****.**>"], "{} 过期检测".format(self.channel), msg)