Code example #1
File: etl_runner_base.py Project: haogods/etl_task
    def __init__(self, channel, is_bulk=False):
        self.channel = channel
        self.cv_raw_store = CVRawStore(self.channel, stage='raw')
        self.cv_measure_store = CVRawStore(self.channel, stage='measure')
        self.is_bulk = is_bulk

        if self.is_bulk:
            t = threading.Thread(target=self.do_bulk_upsert)
            t.start()
Code example #2
class BugFixDispatcher(ETLDispatcherBase):
    def __init__(self, channel, q):
        ETLDispatcherBase.__init__(self, channel, q)
        self.cv_raw_store = CVRawStore(self.channel, stage='raw')
        self.cv_measure_store = CVRawStore(self.channel, stage='measure')

    def real_dispatcher(self, from_where):

        i = 0
        total_cnt = self.cv_raw_store.count_all()

        for doc in self.cv_raw_store.get_all():
            del doc['_id']
            self.queue.put(doc)
            i += 1
            if i % 10000 == 0:
                Logger.printProgress(i, total_cnt)
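A dispatcher like this only fills the queue; worker threads presumably drain it on the other side. A minimal consumer sketch, assuming a None end-of-stream sentinel and the save_one(key, doc, isUpsert=True) signature seen in ETLBase.store below; the worker itself is not project code:

def requeue_worker(queue, measure_store):
    # Hypothetical worker draining the queue filled by BugFixDispatcher.
    while True:
        doc = queue.get()
        if doc is None:  # assumed end-of-stream sentinel
            break
        key = {'cvId': doc['cvId']}
        measure_store.save_one(key, doc, isUpsert=True)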
Code example #3
File: fetch_csv_sample.py Project: haogods/etl_task
    def __init__(self, channel, thread_cnt, need_cnt):
        BaseTask.__init__(self, "fetch_csv_sample", thread_cnt=thread_cnt)
        self.channel = channel
        self.rand = RandomDispatcher(channel, self._queue, need_cnt)
        self.dispatcher = lambda q: self.rand.dispatcher()

        self.dir_path = self.get_save_dir_path()
        CsvWriteBase.__init__(self, self.dir_path, self.channel)

        self.cv_measure_store = CVRawStore(channel, 'measure')
Code example #4
File: fetch_csv_sample.py Project: haogods/etl_task
class FetchCsvSample(BaseTask, CsvWriteBase):

    def __init__(self, channel, thread_cnt, need_cnt):
        BaseTask.__init__(self, "fetch_csv_sample", thread_cnt=thread_cnt)
        self.channel = channel
        self.rand = RandomDispatcher(channel, self._queue, need_cnt)
        self.dispatcher = lambda q: self.rand.dispatcher()

        self.dir_path = self.get_save_dir_path()
        CsvWriteBase.__init__(self, self.dir_path, self.channel)

        self.cv_measure_store = CVRawStore(channel, 'measure')

    def _load_data(self):
        self.rand.load_data()

    def get_save_dir_path(self):
        dir_path = os.path.join(os.path.dirname(__file__), '%s_%s_result' % (self.channel, self._name))
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        return dir_path

    def get_pagecontent(self, cvId):
        doc = self.rand.cv_page_store.get_one(cvId)
        filepath = doc['pageContentPath']
        return self.rand.getPageContent(filepath, 'remote')

    def save_html(self, cvId, pagecontent):
        path = os.path.join(self.dir_path, '%s_html_result' % self.channel)
        if not os.path.exists(path):
            os.mkdir(path)

        with open('%s/%s.html' % (path, cvId.split("://")[1]), 'wb') as f:
            f.write(pagecontent)

    def run_job(self, job):
        cvId = job.get('cvId')
        pagecontent = self.get_pagecontent(cvId)
        self.save_html(cvId, pagecontent)

        measure_data = self.cv_measure_store.get_one(cvId)

        self.save(job, measure_data)
        print "SUCCESS COPIED %s" % cvId

    def end_operation(self, *args, **kwargs):

        print "***********************************" * 2
Code example #5
File: etl_base.py Project: haogods/etl_task
    def __init__(self, channel):
        self.channel = channel
        self.process_item = {}

        self.cv_page_store = CVPageStore(channel)
        self.cv_parser = CvParser()

        self.test_mode = False

        conf_path = os.path.join(os.path.dirname(__file__), '../../conf/thrift_conf.ini')
        self.thrift_client = ThriftClient(conf_path)

        # hash over jobList, used for duplicate detection
        self.cv_hash_table = CVHashTable()
        self._duplication_count = 0

        self.rlock = threading.RLock()

        self.cv_raw_store = CVRawStore(self.channel, stage='raw')
        self.cv_measure_store = CVRawStore(self.channel, stage='measure')
        self.measure_client = self.thrift_client.cv_measure_server_client

        self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw')
        self.cv_51job_measure_store = CVRawStore('cv_51job', 'measure')

        self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw')
        self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure')

        self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw')
        self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure')

        self.fail_save = FileSave('%s_fail_ids.txt' % self.channel)
        self.parsed_cnt = 0
        self.parsed_cnt_lock = threading.RLock()

        # Location of the binary page files:
        #   'local'  means they live on this machine
        #   'remote' means they are fetched from a remote server
        self.bin_file_location = os.environ['BIN_FILE_LOCATION']

        if self.bin_file_location == 'remote':
            self.bin_read_client = self.thrift_client.bin_read_server_client
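Note that the constructor reads BIN_FILE_LOCATION straight from os.environ, so it raises a KeyError when the variable is unset. Setting it up front might look like this (the two values come from the comments above):

import os

# 'local' reads page content from files on this machine;
# 'remote' fetches it through the thrift bin-read client.
os.environ['BIN_FILE_LOCATION'] = 'local'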
Code example #6
File: etl_dipatcher.py Project: haogods/etl_task
class ETLDisFromRawAfterCheck(ETLDisFromRaw):
    '''
        Check consistency between the parsed (raw) table and the measure table.

        This is a rough pass: anything present in the raw table but missing
        from the measure table is re-queued for parsing and measurement.
    '''

    def __init__(self, channel, q):
        ETLDisFromRaw.__init__(self, channel, q)
        self.cv_measure_store = CVRawStore(channel, 'measure')

    def real_dispatcher(self, from_which):
        index = 0
        for item in self.cv_raw_store.get_all():
            index += 1
            if not self.cv_measure_store.find_one({'cvId':item['cvId']}):
                self.queue.put(item)

            if index % 10000 == 0:
                print "copied %d items" % index
Code example #7
File: etl_runner_base.py Project: haogods/etl_task
    def __init__(self, channel):
        self.channel = channel
        self.r_lock = threading.RLock()
        self._duplication_count = 0
        self.cv_hash_table = CVHashTable()

        self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw')
        self.cv_51job_measure_store = CVRawStore('cv_51job', 'measure')

        self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw')
        self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure')

        self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw')
        self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure')

        self.dupli_file_save = open(os.path.join(os.path.dirname(__file__),
                                                 "../result/%s_duplicate_ids_%d" % (self.channel, os.getpid())), 'wb')
Code example #8
    def __init__(self, channel, q):
        ETLDispatcherBase.__init__(self, channel, q)
        self.cv_raw_store = CVRawStore(self.channel, stage='raw')
        self.cv_measure_store = CVRawStore(self.channel, stage='measure')
Code example #9
File: etl_dipatcher.py Project: haogods/etl_task
class ETLDispatcher(ETLDispatcherBase):
    def __init__(self, channel, q):
        ETLDispatcherBase.__init__(self, channel, q)
        self.cv_raw_store = CVRawStore(self.channel, stage='raw')

    def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime, filePath, flag):

        self.process_item[indexUrl] = {
            'updateTime': updateTime,
            'contentSign': contentSign,
            'realUrl': realUrl,
            'filePath': filePath,
            'flag': flag,
        }

        if len(self.process_item) % 10000 == 0:
            print "loaded %d items" % len(self.process_item)

    def check_and_put(self, item):

        updateTime = item.get('updateTime')
        indexUrl = item.get('indexUrl')
        contentSign = item.get('contentSign')
        file_path = item.get('pageContentPath')
        realUrl = item.get('realUrl')

        self.fill_data_with_flag(indexUrl, realUrl, contentSign, updateTime, file_path, 0)

    def real_dispatcher(self, from_which):
        if from_which == 'db':
            self.load_data()
            self.dispatcher_from_db()
        # elif from_which == 'file':
        #     self.dispatcher_from_file()
        else:
            raise Exception("unknown from_which")

    def exist_in_raw(self, indexUrl):
        if self.cv_raw_store.get_one(indexUrl):
            return True

        return False

    def load_data(self):
        for item in self.cv_page_store.get_all():
            # if self.exist_in_raw(item['indexUrl']):
            #     continue
            self.check_and_put(item)

        print "============= totally load %d items ===============" % len(self.process_item)

    def dispatcher_from_db(self):

        i = 0
        total_cnt = len(self.process_item)
        for item in self.process_item:
            pagecontent = self.getPageContent(self.process_item[item].get("filePath"), self.bin_file_location)
            self.queue.put({'indexUrl': item, 'pagecontent': pagecontent,
                            'updateTime': self.process_item[item]['updateTime'],
                            'contentSign': self.process_item[item]['contentSign']})

            i += 1
            if i % 10000 == 0:
                print "processed %.2f%%" % (float(i) * 100 / total_cnt)

        self.queue.put(None)
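Since dispatcher_from_db closes the stream with a None sentinel, the matching consumer stops on it; a minimal sketch (the handler is a stand-in, not project code):

def consume(queue, handle_job):
    # Drain jobs until the dispatcher signals end of stream.
    while True:
        job = queue.get()
        if job is None:
            break
        handle_job(job)  # e.g. parse job['pagecontent']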
Code example #10
File: etl_dipatcher.py Project: haogods/etl_task
    def __init__(self, channel, q):
        ETLDisFromRaw.__init__(self, channel, q)
        self.cv_measure_store = CVRawStore(channel, 'measure')
Code example #11
File: etl_base.py Project: haogods/etl_task
class ETLBase(object):
    def __init__(self, channel):
        self.channel = channel
        self.process_item = {}

        self.cv_page_store = CVPageStore(channel)
        self.cv_parser = CvParser()

        self.test_mode = False

        conf_path = os.path.join(os.path.dirname(__file__), '../../conf/thrift_conf.ini')
        self.thrift_client = ThriftClient(conf_path)

        # hash over jobList, used for duplicate detection
        self.cv_hash_table = CVHashTable()
        self._duplication_count = 0

        self.rlock = threading.RLock()

        self.cv_raw_store = CVRawStore(self.channel, stage='raw')
        self.cv_measure_store = CVRawStore(self.channel, stage='measure')
        self.measure_client = self.thrift_client.cv_measure_server_client

        self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw')
        self.cv_51job_measure_store = CVRawStore('cv_51job', 'measure')

        self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw')
        self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure')

        self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw')
        self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure')

        self.fail_save = FileSave('%s_fail_ids.txt' % self.channel)
        self.parsed_cnt = 0
        self.parsed_cnt_lock = threading.RLock()

        # Location of the binary page files:
        #   'local'  means they live on this machine
        #   'remote' means they are fetched from a remote server
        self.bin_file_location = os.environ['BIN_FILE_LOCATION']

        if self.bin_file_location == 'remote':
            self.bin_read_client = self.thrift_client.bin_read_server_client

    def _inc_parsed_cnt(self):
        with self.parsed_cnt_lock:
            self.parsed_cnt += 1

    def _inc_duplication_count(self):
        with self.rlock:
            self._duplication_count += 1

    def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime, filePath, flag):

        self.process_item[indexUrl] = {
            'updateTime': updateTime,
            'contentSign': contentSign,
            'realUrl': realUrl,
            'filePath': filePath,
            'flag': flag,
        }

        if len(self.process_item) % 10000 == 0:
            print "loaded %d items" % len(self.process_item)

    def store(self, cvId, raw, measure, bulk=False):
        if self.test_mode:
            print "====================test mode cvId: %s================" % cvId
            print "raw: ", raw.to_json()
            return
        key = {"cvId": cvId}
        if not bulk:
            self.cv_raw_store.save_one(key, raw.to_json(), isUpsert=True)
            self.cv_measure_store.save_one(key, measure.to_json(), isUpsert=True)
        else:
            self.cv_raw_store.bulk_upsert(key, raw.to_mongo())
            self.cv_measure_store.bulk_upsert(key, measure.to_mongo())

    def measure(self, raw):
        return self.measure_client.measureCv(raw)

    def check_and_put(self, item):

        updateTime = item.get('updateTime')
        indexUrl = item.get('indexUrl')
        contentSign = item.get('contentSign')
        file_path = item.get('pageContentPath')
        realUrl = item.get('realUrl')

        self.fill_data_with_flag(indexUrl, realUrl, contentSign, updateTime, file_path, 0)

    def _load_data(self):
        for item in self.cv_page_store.get_all():
            self.check_and_put(item)

        print "============= totally load %d items ===============" % len(self.process_item)

    def dispatcher(self, q, from_which='db'):
        if from_which == 'db':
            self.dispatcher_from_db(q)
        elif from_which == 'file':
            self.dispatcher_from_file(q)
        else:
            raise Exception("unknown from_which")

    def dispatcher_from_db(self, q):
        i = 0
        total_cnt = len(self.process_item)
        for item in self.process_item:
            pagecontent = self.getPageContent(self.process_item[item].get("filePath"), self.bin_file_location)
            q.put({'indexUrl': item, 'pagecontent': pagecontent,
                   'updateTime': self.process_item[item]['updateTime'],
                   'contentSign': self.process_item[item]['contentSign']})

            i += 1
            if i % 10000 == 0:
                print "processed %.2f%%" % (float(i) * 100 / total_cnt)

        q.put(None)

    def dispatcher_from_file(self, q):
        with open('%s_need_fix_ids.txt' % self.channel, 'rb') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                doc = self.cv_page_store.get_one(line)
                pagecontent = self.getPageContent(doc.get('pageContentPath'), self.bin_file_location)
                q.put({'indexUrl': line, 'pagecontent': pagecontent,
                       'updateTime': doc['updateTime'], 'contentSign': doc['contentSign']})

        q.put(None)

    def getPageContent(self, filename, from_where='local'):
        if from_where == 'local':
            parts = filename.split("::")
            if len(parts) == 3:
                binReader = BinReader(parts[1])
                _, content = binReader.readone_at(int(parts[2]))
                if len(content) == 0:
                    raise Exception("file name:{} , content error".format(filename))
                return content

            if len(parts) == 1:
                with open(filename) as f:
                    content = f.read()
                    if len(content) == 0:
                        raise Exception("file name:{} , content error".format(filename))
                    return content
        elif from_where == 'remote':
            # TODO: fetch the bin file content from the remote server
            content = self.bin_read_client.getHtml(filename)
            return content

        else:
            raise Exception("unknown from_where")

    def check_has_duplication(self, indexUrl, ps):
        for s in ps.get('jobList', []):
            incName = s.get('incName', '')
            jobPosition = s.get('jobPosition', '')
            jobDesc = s.get('jobDesc', '')

            # dedupe only when all three fields are non-empty
            if not (incName and jobPosition and jobDesc):
                return False

            hash_value = util.md5([incName, jobPosition, jobDesc], remove_space=True)
            key = {'hash_value': hash_value}
            hash_doc = self.cv_hash_table.get_one(key)
            if hash_doc:
                # count duplicates
                self._inc_duplication_count()

                # if the current channel has higher priority, replace the
                # entry stored in the hash table
                cvId_in_db = hash_doc.get('cvId')
                # same id: this may simply be an update
                if cvId_in_db == indexUrl:
                    return False

                cv_channel_in_db = cvId_in_db.split('://')[0]
                if CHANNEL_PRIORITY.get(self.channel, 0) > CHANNEL_PRIORITY.get(cv_channel_in_db, 0):
                    hash_doc['cvId'] = indexUrl
                    self.cv_hash_table.upsert_one(key, hash_doc)
                    # remove the lower-priority record so the parsed data stays duplicate-free
                    self.remove_duplication(cvId_in_db, cv_channel_in_db)

                    return False
                return True
            else:
                hash_doc = {'hash_value': hash_value, 'cvId': indexUrl}
                self.cv_hash_table.upsert_one(key, hash_doc)

        return False

    def remove_duplication(self, cvId, channel):
        key = {'cvId': cvId}
        if channel == 'cv_51job':
            self.cv_51job_raw_store.remove_one(key)
            self.cv_51job_measure_store.remove_one(key)
        elif channel == 'cv_zhilian':
            self.cv_zhilian_raw_store.remove_one(key)
            self.cv_zhilian_measure_store.remove_one(key)

        elif channel == 'cv_liepin':
            self.cv_liepin_raw_store.remove_one(key)
            self.cv_liepin_measure_store.remove_one(key)

        else:
            raise Exception('unknown channel')

    def real_run(self, job):

        indexUrl = job.get('indexUrl')
        pagecontent = job.get('pagecontent')
        #print pagecontent
        ps = self.cv_parser.parser(htmlContent=pagecontent, cvFrom=self.channel)

        # check whether this cv is a duplicate
        if self.check_has_duplication(indexUrl, ps):
            return

        cvRaw_obj = constructCvRawObj(ps)
        cvRaw_obj.cvId = job.get('indexUrl')
        cvRaw_obj.cvFrom = self.channel
        cvRaw_obj.updateTime = job.get('updateTime')
        cvRaw_obj.contentSign = job.get('contentSign')

        cvRaw = convertToCvRaw(cvRaw_obj)
        cvMeasure = self.measure(cvRaw)

        if not cvMeasure:
            print cvRaw

        cvMeasured_obj = constructCvMeasureObj(cvMeasure)

        self.store(indexUrl, cvRaw_obj, cvMeasured_obj, False)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            util.send_email(["<*****@*****.**>"], "{} sample comparison".format(self.channel),
                            msg + '\n duplicate cvs: %d' % self._duplication_count +
                            '\n parsed count: %d' % self.parsed_cnt)
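check_has_duplication consults a module-level CHANNEL_PRIORITY mapping that never appears in these excerpts; it presumably has a shape like the sketch below. Only the three channel names are from the source; the ordering and values are guesses:

# Hypothetical priority table; a higher value wins the comparison above.
CHANNEL_PRIORITY = {
    'cv_liepin': 3,
    'cv_zhilian': 2,
    'cv_51job': 1,
}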
Code example #12
def get_measure_count(channel):
    store = CVRawStore(channel, stage="measure")
    return store.count_all()
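Usage is a one-liner (the channel name is illustrative):

print "measure docs for cv_51job: %d" % get_measure_count('cv_51job')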
Code example #13
File: etl_runner_base.py Project: haogods/etl_task
class CheckCvDupliRunner(object):
    def __init__(self, channel):
        self.channel = channel
        self.r_lock = threading.RLock()
        self._duplication_count = 0
        self.cv_hash_table = CVHashTable()

        self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw')
        self.cv_51job_measure_store = CVRawStore('cv_51job', 'measure')

        self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw')
        self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure')

        self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw')
        self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure')

        self.dupli_file_save = open(os.path.join(os.path.dirname(__file__),
                                                 "../result/%s_duplicate_ids_%d" % (self.channel, os.getpid())), 'wb')

    def _inc_duplication_count(self, indexUrl):
        with self.r_lock:
            self._duplication_count += 1

    def remove_duplication(self, cvId_to_remove, channel, indexUrl):
        key = {'cvId': cvId_to_remove}

        if channel == 'cv_51job':
            self.cv_51job_raw_store.delete_one(key)
            self.cv_51job_measure_store.delete_one(key)
        elif channel == 'cv_zhilian':
            self.cv_zhilian_raw_store.delete_one(key)
            self.cv_zhilian_measure_store.delete_one(key)

        elif channel == 'cv_liepin':
            self.cv_liepin_raw_store.delete_one(key)
            self.cv_liepin_measure_store.delete_one(key)

        else:
            raise Exception('unknown channel')

        print >> self.dupli_file_save, "[D]cvId: %s is removed for duplicating with cvId: %s" % (cvId_to_remove, indexUrl)

    def run(self, indexUrl, ps):
        for s in ps.get('jobList', []):
            incName = s.get('incName', '')
            jobPosition = s.get('jobPosition', '')
            jobDesc = s.get('jobDesc', '')

            # dedupe only when all three fields are non-empty
            if not (incName and jobPosition and jobDesc):
                return False

            hash_value = util.md5([incName, jobPosition, jobDesc], remove_space=True)
            key = {'hash_value': hash_value}
            hash_doc = self.cv_hash_table.find_one(key)
            if hash_doc:
                # count duplicates
                self._inc_duplication_count(indexUrl)

                # if the current channel has higher priority, replace the
                # entry stored in the hash table
                cvId_in_db = hash_doc.get('cvId')
                # same id: this may simply be an update
                if cvId_in_db == indexUrl:
                    return False

                cv_channel_in_db = cvId_in_db.split('://')[0]
                if CHANNEL_PRIORITY.get(self.channel, 0) > CHANNEL_PRIORITY.get(cv_channel_in_db, 0):
                    hash_doc['cvId'] = indexUrl
                    self.cv_hash_table.save_one(key, hash_doc, True)
                    # remove the lower-priority record so the parsed data stays duplicate-free
                    self.remove_duplication(cvId_in_db, cv_channel_in_db, indexUrl)
                    return False
                else:
                    self.remove_duplication(indexUrl, indexUrl.split('://')[0], cvId_in_db)

                    return True
            else:
                hash_doc = {'hash_value': hash_value, 'cvId': indexUrl}
                self.cv_hash_table.save_one(key, hash_doc, True)
                return False

        return False
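A caller would store the cv only when run() returns False, i.e. it is new, a same-id update, or it won the priority comparison. A sketch, with indexUrl and the parsed dict ps standing in for real parser output:

# Illustrative caller; ps is assumed to be the parser output with a
# 'jobList' of {incName, jobPosition, jobDesc} entries.
checker = CheckCvDupliRunner('cv_zhilian')
if not checker.run(indexUrl, ps):
    store_cv(indexUrl)  # hypothetical store step; skipped for duplicates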
Code example #14
File: etl_runner_base.py Project: haogods/etl_task
class StoreRunner(object):
    def __init__(self, channel, is_bulk=False):
        self.channel = channel
        self.cv_raw_store = CVRawStore(self.channel, stage='raw')
        self.cv_measure_store = CVRawStore(self.channel, stage='measure')
        self.is_bulk = is_bulk

        if self.is_bulk:
            t = threading.Thread(target=self.do_bulk_upsert)
            t.start()

    def do_bulk_upsert(self):
        while True:
            time.sleep(random.randint(300, 3000))
            print "do upserting..."
            self.cv_raw_store.bulk_upsert(None, None)
            self.cv_measure_store.bulk_upsert(None, None)
            print "do upserting ok"

    def run(self, cv_id, cv_raw, cv_measure):
        key = {"cvId": cv_id}

        if not self.is_bulk:
            if cv_raw:
                self.cv_raw_store.save_one(key, cv_raw.to_json(), isUpsert=True)
            if cv_measure:
                self.cv_measure_store.save_one(key, cv_measure.to_json(), isUpsert=True)
        else:
            if cv_raw:
                self.cv_raw_store.bulk_upsert(key, cv_raw.to_mongo())
            if cv_measure:
                self.cv_measure_store.bulk_upsert(key, cv_measure.to_mongo())
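A usage sketch for the non-bulk path, assuming cv_raw and cv_measure are the document objects exposing to_json()/to_mongo() as above:

# Each call upserts both stages keyed by cvId.
runner = StoreRunner('cv_51job', is_bulk=False)
runner.run(cv_id, cv_raw, cv_measure)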