Beispiel #1
0
class ParseRunner(object):
    """Run CV parsing either locally (CvParser) or through a remote HTTP service.

    :param channel: source channel name, forwarded to the parser as ``cvFrom``.
    :param http_service_addr: optional URL of a remote parse service; when set,
        pages are POSTed there instead of being parsed in-process.
    """

    def __init__(self, channel, http_service_addr=None):
        self.channel = channel
        self.http_service_addr = http_service_addr
        # local parser, used only when no HTTP service address is configured
        self.cv_parser = CvParser()

    def run(self, pagecontent):
        """Parse *pagecontent* and return the parsed CV structure (dict)."""
        if not self.http_service_addr:
            # Pass htmlContent by keyword for consistency with the other
            # CvParser.parser call sites in this project.
            return self.cv_parser.parser(htmlContent=pagecontent, cvFrom=self.channel)
        result = requests.post(self.http_service_addr,
                               data={'channel': self.channel, 'pagecontent': pagecontent})
        # requests' Response.text is already unicode, so the deprecated
        # (and since Python 3.9 removed) json.loads(encoding=...) argument
        # was a no-op; drop it.
        return json.loads(result.text)
Beispiel #2
0
    def __init__(self, channel):
        """Set up parsing, storage, thrift clients and dedup state for *channel*."""
        self.channel = channel
        # work items keyed by index URL; populated elsewhere — TODO confirm against caller
        self.process_item = {}

        self.cv_page_store = CVPageStore(channel)
        self.cv_parser = CvParser()

        # when True, downstream code is expected to skip real persistence
        self.test_mode = False

        conf_path = os.path.join(os.path.dirname(__file__), '../../conf/thrift_conf.ini')
        self.thrift_client = ThriftClient(conf_path)

        # jobList hash table, used to detect duplicate CVs
        self.cv_hash_table = CVHashTable()
        self._duplication_count = 0

        self.rlock = threading.RLock()

        self.cv_raw_store = CVRawStore(self.channel, stage='raw')
        self.cv_measure_store = CVRawStore(self.channel, stage='measure')
        self.measure_client = self.thrift_client.cv_measure_server_client

        # per-channel stores (raw + measure) for each supported source site
        self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw')
        self.cv_51job_measure_store = CVRawStore('cv_51job','measure')

        self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw')
        self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure')

        self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw')
        self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure')

        self.fail_save = FileSave('%s_fail_ids.txt' % self.channel)
        self.parsed_cnt = 0
        self.parsed_cnt_lock = threading.RLock()

        # binary page-file location setting:
        #   'local'  - files are on this machine
        #   'remote' - fetched through the thrift bin-read service
        self.bin_file_location = os.environ['BIN_FILE_LOCATION']

        if self.bin_file_location == 'remote':
            self.bin_read_client = self.thrift_client.bin_read_server_client
Beispiel #3
0
    def __init__(self, channel, prefix=None):
        """Initialize comparison-task state and resolve all working paths.

        When *prefix* is given every path is placed under that directory,
        otherwise bare channel-prefixed names in the current directory are used.
        """
        BaseTask.__init__(self, channel, thread_cnt=2)
        self.channel = channel
        self.cv_parser = CvParser()
        self._cvs_data = {}

        def resolve(bare_tpl, prefixed_tpl):
            # Choose the bare template unless a prefix directory was supplied.
            if not prefix:
                return bare_tpl % self.channel
            return prefixed_tpl % (prefix, self.channel)

        # scratch dir holding the per-CV diff results
        self.diff_rs_dir = resolve('%s_rs_diff', '%s/%s_job_diff')

        # dir for machine-parsed output
        self.mechine_rs_dir = resolve('%s_mechine_parsed', '%s/%s_mechine_parsed')

        # dir for human-parsed output
        self.person_rs_dir = resolve('%s_person_parsed', '%s/%s_person_parsed')

        # final report file, also fed to the Excel writer
        self.result_file = resolve('%s_result.txt', '%s/%s_result.txt')
        self.excel_save = ExcelFileSave(self.result_file)

        # dir containing the sample HTML files
        self.sample_dir = resolve('%s_sample', '%s/%s_sample')
Beispiel #4
0
class CvPS(BaseTask):
    """Compare machine-parsed CVs against human-parsed reference data.

    Walks HTML samples in ``sample_dir``, parses each with CvParser, diffs
    the result field-by-field against the human-parsed CSV data, and writes
    per-CV diff/parsed JSON files plus a summary Excel sheet.
    (Python 2 code: print statements, ``unicode`` type.)
    """

    def __init__(self, channel, prefix=None):
        """Resolve working paths; with *prefix* set, paths live under that dir."""
        BaseTask.__init__(self, channel, thread_cnt=2)
        self.channel = channel
        self.cv_parser = CvParser()
        # cvId -> human-parsed JSON string, filled by _load_data()
        self._cvs_data = {}

        # scratch dir for the field-by-field diff results
        self.diff_rs_dir = '%s_rs_diff' % self.channel if not prefix else '%s/%s_job_diff' % (prefix, self.channel)

        # machine-parsed output dir
        self.mechine_rs_dir = '%s_mechine_parsed' % self.channel if not prefix else '%s/%s_mechine_parsed' % (prefix, self.channel)

        # human-parsed output dir
        self.person_rs_dir = '%s_person_parsed' % self.channel if not prefix else '%s/%s_person_parsed' % (prefix, self.channel)

        # final result file
        self.result_file = '%s_result.txt' % self.channel if not prefix else '%s/%s_result.txt' % (prefix, self.channel)
        # util.check_and_clear(os.path.dirname(self.result_file))
        self.excel_save = ExcelFileSave(self.result_file)

        # sample HTML files
        self.sample_dir = '%s_sample' % self.channel if not prefix else '%s/%s_sample' % (prefix, self.channel)

    def start_operation(self, *args, **kwargs):
        """Prepare working directories: create them if missing, empty them otherwise."""
        # cleanup / preparation before the run
        print os.getcwd()
        for f in [self.diff_rs_dir, self.mechine_rs_dir]:
            if not os.path.isdir(f):
                os.system('mkdir -p %s' % f)
            else:
                os.system('rm -rf %s/*' % f)

    def dispatcher(self):
        """Queue one job per .html sample: {'fn': path, 'cvId': basename}."""
        fs = os.listdir(self.sample_dir)
        for file in fs:

            real_file = "%s/%s" % (self.sample_dir, file)
            # only plain .html files are samples; skip everything else
            if not os.path.isfile(real_file) or \
                os.path.splitext(real_file)[1] != '.html':
                continue

            fname = os.path.split(real_file)[1]
            cvId = os.path.splitext(fname)[0]
            self._queue.put({'fn': real_file, 'cvId': cvId})

    def run_job(self, job):
        """Parse one sample HTML file and diff it against the human reference.

        Failures are logged and swallowed so one bad sample does not stop the run.
        """
        if not job:
            return

        htmlfile = job.get('fn')
        cvId = job.get('cvId')

        with open(htmlfile, 'rb') as f:
            pagecontent = f.read()

        try:
            ps = self.cv_parser.parser(htmlContent=pagecontent, cvFrom=self.channel)
            cvRaw_obj = constructCvRawObj(ps)

            # normalize the id to "<channel>://<cvId>"
            cvRaw_obj.cvId = "%s://%s" % (self.channel, cvId)
            cvRaw_obj.cvFrom = self.channel

            self._check_fields(cvRaw_obj.to_json(), cvId)
            # Logger.default_log("cvId: %s Ok" % cvId)
        except Exception as e:
            Logger.default_log("cvId: %s Fail" % cvId)
            traceback.print_exc()

    def _load_data(self):
        """Load the human-parsed reference CSV into self._cvs_data (cvId -> JSON string)."""
        with open('%s/%s' % (self.person_rs_dir, 'parsed.csv'), 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for index, row in enumerate(reader):
                # first row is the CSV header
                if index == 0:
                    continue
                # column 0 is the file name; keep only the stem, stripping a
                # possible resume-ID label prefix
                cvId = row[0].split('.')[0]
                if '简历ID:' in cvId:
                    cvId = cvId.replace('简历ID:', '')
                # column 1 holds the parsed JSON; strip embedded newlines
                parsed_data = row[1].replace('\n','')

                self._cvs_data.update({cvId: parsed_data})

    def _check_fields(self, cvRaw_obj, cvId):
        """Diff the machine-parsed CV (JSON string) against the human-parsed one.

        Builds ``diff_rs``, a deep copy of the human result whose leaves are
        replaced by 0 (fields agree) or 1 (fields differ), then writes the
        diff, both parse results, and an Excel summary section.
        """
        cvRaw_obj = json.loads(cvRaw_obj)
        person_parsed = self._cvs_data.get(cvId, '')
        if not person_parsed:
            # no human reference for this cvId -- nothing to compare
            return

        person_parsed = json.loads(person_parsed)

        print person_parsed

        diff_rs = copy.deepcopy(person_parsed)
        for key, value in person_parsed.items():

            if key == 'cvFrom':
                diff_rs['cvFrom'] = 0
                continue

            if key == 'baseInfo':
                for key1, value2 in value.items():
                    # ignore cvId
                    if key1 == 'cvId':
                        diff_rs['baseInfo'][key1] = 0
                        continue

                    # both sides empty counts as a match
                    if value2 in [None,'None',''] and not cvRaw_obj['baseInfo'].get(key1, ''):
                        diff_rs['baseInfo'][key1] = 0
                        continue

                    if value2 != cvRaw_obj['baseInfo'].get(key1, ''):
                        diff_rs['baseInfo'][key1] = 1  # parsed values differ
                    else:
                        diff_rs['baseInfo'][key1] = 0  # parsed values agree

            elif isinstance(value, (str, unicode)):
                diff_rs[key] = 0 if person_parsed[key] == cvRaw_obj.get(key, '') else 1

            elif key in ['languageList', 'proList', 'skillList', 'trainList', 'jobList', 'eduList', 'certList']:
                if not person_parsed[key]:
                    if not cvRaw_obj.get(key, ''):
                        diff_rs[key] = 0
                    else:
                        diff_rs[key] = 1
                    continue

                for index, languageItem in enumerate(person_parsed[key]):
                    # machine result empty: mark the whole list different
                    if not cvRaw_obj.get(key, ''):
                        diff_rs[key] = 1
                        break
                    # machine list shorter than the human one: also mark whole list
                    elif cvRaw_obj.get(key, '') and len(cvRaw_obj.get(key)) < len(person_parsed[key]):
                        diff_rs[key] = 1
                        break
                    for key1, value1 in languageItem.items():

                        if cvRaw_obj[key][index].get(key1, '') != value1 and key1 != 'positionList':
                            diff_rs[key][index][key1] = 1 # parsed values differ

                        elif key1 == 'positionList':
                            # nested list of dicts; compare element-wise
                            for positionIndex, positionItem in enumerate(value1):
                                if len(cvRaw_obj[key][index].get(key1, [])) < len(value1):
                                    diff_rs[key][index][key1] = 1
                                    continue

                                for key2, value2 in positionItem.items():
                                    diff_rs[key][index][key1][positionIndex][key2] = 0 if cvRaw_obj[key][index][key1][positionIndex].get(key2,'') == value2 else 1


                        else:
                            diff_rs[key][index][key1] = 0

            # NOTE(review): unreachable -- 'jobList' is already handled by the
            # list-comparison branch above, so this elif can never match.
            elif key in ['jobList']:
                if not person_parsed[key]:
                    if not cvRaw_obj.get(key, ''):
                        diff_rs[key] = 0
                    else:
                        diff_rs[key] = 1
                    continue




            elif key in ['others', 'privateInfo', 'jobExp']:
                # NOTE(review): 'person_parsed in [...]' tests the whole dict;
                # presumably person_parsed[key] was intended -- confirm.
                if not person_parsed[key] or person_parsed in ['None', None]:
                    if not cvRaw_obj.get(key, ''):
                        diff_rs[key] = 0
                    else:
                        diff_rs[key] = 1

                for key1, value1 in person_parsed[key].items():
                    if value1 in ['None', None, '']:
                        # human field empty: match only if machine field empty too
                        if not cvRaw_obj[key].get(key1, ''):
                            diff_rs[key][key1] = 0
                        else:
                            diff_rs[key][key1] = 1

                        continue

                    if cvRaw_obj[key].get(key1) != value1:
                        diff_rs[key][key1] = 1 # parse mismatch
                    else:
                        diff_rs[key][key1] = 0

        self._save_diff_rs(cvId, diff_rs)
        self._save_mechine_parsed(cvId, cvRaw_obj)
        self._save_people_parsed(cvId, person_parsed)
        # Logger.default_log('complete cvId: %s' % cvId)

        self._save_excel(cvId, person_parsed, cvRaw_obj, diff_rs)

    def _save_mechine_parsed(self, cvId, cvRaw_obj):
        """Write the machine-parsed result as pretty-printed UTF-8 JSON."""
        with codecs.open("%s/%s.json" % (self.mechine_rs_dir, cvId), 'wb', encoding='utf-8') as f:
            f.write(json.dumps(cvRaw_obj, indent=4, ensure_ascii=False))

    def _save_people_parsed(self, cvId, person_parsed):
        """Write the human-parsed reference as pretty-printed UTF-8 JSON."""
        with codecs.open("%s/%s.json" % (self.person_rs_dir, cvId), 'wb', encoding='utf-8') as f:
            f.write(json.dumps(person_parsed, indent=4, ensure_ascii=False))

    def _save_diff_rs(self, cvId, diff_rs):
        """Write the 0/1 diff map for one CV as pretty-printed JSON."""
        with open("%s/%s.json" % (self.diff_rs_dir, cvId), 'wb') as f:
            f.write(json.dumps(diff_rs, indent=4))

    def _save_excel(self, cvid, person_parsed, mechine_obj, diff_rs):
        """Append one CV's field-by-field comparison to the Excel report.

        Row shape: ['', dotted.field.path, human value, machine value, 0/1].
        """
        self.excel_save.append([cvid])
        for key, value in diff_rs.items():
            if isinstance(value, (str, unicode, int)):
                self.excel_save.append(['', key, person_parsed[key], mechine_obj.get(key, ''), diff_rs[key]])

            if isinstance(value, dict):
                for key1, value2 in value.items():
                    if key1 == 'cvId':
                        continue

                    if isinstance(value2, (str, unicode, int)):
                        self.excel_save.append(['',"%s.%s"%(key, key1), person_parsed[key][key1], mechine_obj[key].get(key1,''), diff_rs[key][key1]])
                    else:
                        # unexpected nesting inside a dict-valued section
                        print type(value2)
                        raise Exception("%s.%s"%(key, key1))

            if isinstance(value, list):
                for index1, item1 in enumerate(value):
                    if isinstance(item1, dict):
                        for key2, value2 in item1.items():
                            if isinstance(value2, list):
                                # doubly nested list (e.g. jobList[i].positionList)
                                for index2, item2 in enumerate(value2):
                                    if isinstance(item2, dict):
                                        for key3, value3 in item2.items():
                                            mechine_value = mechine_obj[key][index1].get(key2, '')
                                            # NOTE(review): guard looks inverted -- a non-empty but
                                            # too-short machine list yields 0, while an empty machine
                                            # value falls through to the indexing below; confirm.
                                            if mechine_value and len(mechine_value) <= index2:
                                                mechine_value =0
                                            else:
                                                mechine_value = mechine_value[index2].get(key3, '')

                                            self.excel_save.append(['',"%s.%d.%s.%d.%s" % (key, index1, key2, index2, key3), person_parsed[key][index1][key2][index2][key3], mechine_value, diff_rs[key][index1][key2][index2][key3]])

                            else:
                                self.excel_save.append(['',"%s.%d.%s" % (key, index1, key2), person_parsed[key][index1][key2], mechine_obj[key][index1].get(key2, ''), diff_rs[key][index1][key2]])

                    else:
                        raise Exception("%s.%d" % (key, index1))

        self.excel_save.append([''])

    def event_handler(self, evt, msg, **kwargs):
        """On 'DONE': e-mail the summary and publish the result file for download."""
        if evt == 'DONE':
            util.send_email(["<*****@*****.**>"], "{} 样本对比".format(self.channel), msg)

            # destination of the final result file
            filedest = 'app/share/%s_result.txt' % self.channel

            if os.path.exists(filedest):
                now = util.get_date_with_day_duration()
                history_fn = os.path.join(os.path.dirname(filedest), '%s_%s.txt' % (self.channel, '%d%02d%02d%02d%02d' % (now.year, now.month, now.day, now.hour, now.minute)))

                # move the previous result aside as a timestamped history file
                os.system('mv %s %s' % (filedest, history_fn))

            # put the new result into the share directory for download
            os.system('mv %s %s' % (self.result_file, filedest))
Beispiel #5
0
class ETLBase(object):
    def __init__(self, channel):
        self.channel = channel
        self.process_item = {}

        self.cv_page_store = CVPageStore(channel)
        self.cv_parser = CvParser()

        self.test_mode = False

        conf_path = os.path.join(os.path.dirname(__file__), '../../conf/thrift_conf.ini')
        self.thrift_client = ThriftClient(conf_path)

        # jobList 哈希, 用于判断重复
        self.cv_hash_table = CVHashTable()
        self._duplication_count = 0

        self.rlock = threading.RLock()

        self.cv_raw_store = CVRawStore(self.channel, stage='raw')
        self.cv_measure_store = CVRawStore(self.channel, stage='measure')
        self.measure_client = self.thrift_client.cv_measure_server_client

        self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw')
        self.cv_51job_measure_store = CVRawStore('cv_51job','measure')

        self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw')
        self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure')

        self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw')
        self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure')

        self.fail_save = FileSave('%s_fail_ids.txt' % self.channel)
        self.parsed_cnt = 0
        self.parsed_cnt_lock = threading.RLock()

        # 二进制文件位置, 设置
        # local 表示在本地
        # remote 表示在远程
        self.bin_file_location = os.environ['BIN_FILE_LOCATION']

        if self.bin_file_location == 'remote':
            self.bin_read_client = self.thrift_client.bin_read_server_client

    def _inc_parsed_cnt(self):
        with self.parsed_cnt_lock:
            self.parsed_cnt += 1

    def _inc_duplication_count(self):
        with self.rlock:
            self._duplication_count += 1

    def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime, filePath, flag):

        self.process_item[indexUrl] = {
            'updateTime': updateTime,
            'contentSign': contentSign,
            'realUrl': realUrl,
            'filePath': filePath,
            'flag': flag,
        }

        if len(self.process_item)%10000 == 0:
            print "load items: %d" % len(self.process_item)

    def store(self, cvId, raw, measure, bulk=False):
        if self.test_mode:
            print "====================test mode cvId: %s================" % cvId
            print "raw: ", raw.to_json()
            return
        key = {"cvId": cvId}
        if not bulk:
            self.cv_raw_store.save_one(key, raw.to_json(), isUpsert=True)
            self.cv_measure_store.save_one(key, measure.to_json(), isUpsert=True)
        else:
            self.cv_raw_store.bulk_upsert(key, raw.to_mongo())
            self.cv_raw_store.bulk_upsert(key, measure.to_mongo())

    def measure(self, raw):
        return self.measure_client.measureCv(raw)

    def check_and_put(self, item):

        updateTime = item.get('updateTime')
        indexUrl = item.get('indexUrl')
        contentSign = item.get('contentSign')
        file_path = item.get('pageContentPath')
        realUrl = item.get('realUrl')

        self.fill_data_with_flag(indexUrl, realUrl, contentSign, updateTime, file_path, 0)

    def _load_data(self):
        for item in self.cv_page_store.get_all():
            self.check_and_put(item)

        print "============= totally load %d items ===============" % len(self.process_item)

    def dispatcher(self, q, from_which='db'):
        if from_which == 'db':
            self.dispatcher_from_db(q)
        elif from_which == 'file':
            self.dispatcher_from_file(q)
        else:
            raise Exception("unknown from_which")

    def dispatcher_from_db(self, q):
        i = 0
        total_cnt = len(self.process_item)
        for item in self.process_item:
            pagecontent = self.getPageContent(self.process_item[item].get("filePath"), self.bin_file_location)
            q.put({'indexUrl': item, 'pagecontent': pagecontent, 'updateTime':self.process_item[item]['updateTime'], 'contentSign':
                   self.process_item[item]['contentSign']})

            i += 1
            if i % 10000 == 0:
                print "processed %f%%" % (float(i*100/total_cnt))

        q.put(None)

    def dispatcher_from_file(self, q):
        with open('%s_need_fix_ids.txt', 'rb') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                doc = self.cv_page_store.get_one(line)
                pagecontent = self.getPageContent(doc.get('pageContentPath'), self.bin_file_location)
                q.put({'indexUrl': line, 'pagecontent': pagecontent, 'updateTime':doc['updateTime'], 'contentSign':
                   doc['contentSign']})

        q.put(None)

    def getPageContent(self, filename, from_where='local'):
        if from_where == 'local':
            parts = filename.split("::")
            if len(parts) == 3:
                binReader = BinReader(parts[1])
                _, content = binReader.readone_at(int(parts[2]))
                if len(content) == 0:
                    raise Exception("file name:{} , content error".format(filename))
                return content

            if len(parts) == 1:
                with open(filename) as f:
                    content = f.read()
                    if len(content) == 0:
                        raise Exception("file name:{} , content error".format(filename))
                    return content
        elif from_where == 'remote':
            #TODO
            # 从远程获取bin文件内容
            content = self.bin_read_client.getHtml(filename)
            return content

        else:
            raise Exception("unknown from_where")

    def check_has_duplication(self, indexUrl, ps):
        for s in ps.get('jobList', []):
            incName = s.get('incName', '')
            jobPosition = s.get('jobPosition', '')
            jobDesc = s.get('jobDesc', '')

            # 都不为空, 才会判重
            if not (incName and jobPosition and jobDesc):
                return False

            hash_value = util.md5([incName, jobPosition, jobDesc], remove_space=True)
            key = {'hash_value': hash_value}
            hash_doc = self.cv_hash_table.get_one(key)
            if hash_doc:
                # 统计重复数
                self._inc_duplication_count()

                # 如果此渠道优先级比较大, 替换掉存在hash表中的
                cvId_in_db = hash_doc.get('cvId')
                # 相同Id, 可能更新
                if cvId_in_db == indexUrl:
                    return False

                cv_channel_in_db = cvId_in_db.split('://')[0]
                if CHANNEL_PRIORITY.get(self.channel, 0) > CHANNEL_PRIORITY.get(cv_channel_in_db, 0):
                    hash_doc['cvId'] = indexUrl
                    self.cv_hash_table.upsert_one(key, hash_doc)
                    # remove 优先级低的, 保持解析数据没有重复
                    self.remove_duplication(cvId_in_db, cv_channel_in_db)

                    return False
                return True
            else:
                hash_doc = {'hash_value': hash_value, 'cvId': indexUrl}
                self.cv_hash_table.upsert_one(key, hash_doc)

    def remove_duplication(self, cvId, channel):
        key = {'cvId': cvId}
        if channel == 'cv_51job':
            self.cv_51job_raw_store.remove_one(key)
            self.cv_51job_measure_store.remove_one(key)
        elif channel == 'cv_zhilian':
            self.cv_zhilian_raw_store.remove_one(key)
            self.cv_zhilian_measure_store.remove_one(key)

        elif channel == 'cv_liepin':
            self.cv_liepin_raw_store.remove_one(key)
            self.cv_liepin_measure_store.remove_one(key)

        else:
            raise Exception('unknown channel')

    def real_run(self, job):

        indexUrl = job.get('indexUrl')
        pagecontent = job.get('pagecontent')
        #print pagecontent
        ps = self.cv_parser.parser(htmlContent=pagecontent, cvFrom=self.channel)

        # 判断是否重复cv
        if self.check_has_duplication(indexUrl, ps):
            return

        cvRaw_obj = constructCvRawObj(ps)
        cvRaw_obj.cvId = job.get('indexUrl')
        cvRaw_obj.cvFrom = self.channel
        cvRaw_obj.updateTime = job.get('updateTime')
        cvRaw_obj.contentSign = job.get('contentSign')

        cvRaw = convertToCvRaw(cvRaw_obj)
        cvMeasure = self.measure(cvRaw)

        if not cvMeasure:
            print cvRaw

        cvMeasured_obj = constructCvMeasureObj(cvMeasure)

        self.store(indexUrl, cvRaw_obj, cvMeasured_obj, False)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            util.send_email(["<*****@*****.**>"], "{} 样本对比".format(self.channel), msg + '\n duplicate cvs: %d' % self._duplication_count +
                            '\n parsed count: %d' % self.parsed_cnt)
Beispiel #6
0
 def __init__(self, channel, http_service_addr=None):
     """Remember the channel and optional HTTP parse-service address."""
     self.channel = channel
     self.http_service_addr = http_service_addr
     # local parser; presumably the fallback when no HTTP service address is
     # configured (as in ParseRunner.run) -- confirm against the full class
     self.cv_parser = CvParser()