class FixCvField(BaseTask): def __init__(self, channel, field, field_pattern): BaseTask.__init__(self, "fix_cv_field") self.channel = channel self.field = field self.field_pattern = field_pattern self.bugfix = BugFixDispatcher(channel, self._queue) self.dispatcher = lambda q: self.bugfix.dispatcher() self.etl_runner = ETLRunnerFromRaw(channel) def run_job(self, job): try: if not isinstance(job, dict): return split_field = self.field.split('.') if len(split_field) == 2: field_1 = getattr(job, split_field[0]) if isinstance(field_1, list): for item in field_1: field_2 = getattr(item, split_field[1]) assert isinstance(field_2, basestring) if re.search(self.field_pattern, field_2): # 匹配模式,重新量化 print "cvId: ", job.get('cvId'), " || ", print self.field_pattern, " : ", field_2, " || ", self.etl_runner.run(job) except Exception as e: traceback.print_exc()
class FixCv(BaseTask): def __init__(self, channel, thread_cnt=5): BaseTask.__init__(self, "fixbug", thread_cnt=thread_cnt) self.chanel = channel self.bugfix = BugFixDispatcher(channel, self._queue) self.dispatcher = lambda q: self.bugfix.dispatcher() self.etl_runner = ETLRunner(channel) def run_job(self, job): if not isinstance(job, dict): return jobExpSalary = job.get('jobExp',{}).get('expSalary', '').encode('utf-8') if not jobExpSalary: return find = re.search(r'(\d+)', jobExpSalary) if find: return else: if '面议' in jobExpSalary: print job.get('cvId'), " ==> ", "面议, 略过" return page_index = self.get_page_index(job.get('cvId', '')) print job.get('cvId'), " ==> ", jobExpSalary self.etl_runner.run(page_index) print "SUCESS COPIED %s" % job.get('cvId') def get_page_index(self, cvId): doc = self.bugfix.cv_page_store.get_one(cvId) pagecontent = self.bugfix.getPageContent(doc['pageContentPath'],'remote') if not pagecontent: print "cvId : %s, pagecontent null" % cvId job = { 'indexUrl': doc['indexUrl'], 'pagecontent': pagecontent, 'updateTime':doc['updateTime'], 'contentSign':doc['contentSign'], } return job
class GetsRaw(BaseTask, CsvWriteBase): def __init__(self, channel, thread_cnt=5): BaseTask.__init__(self, "gets_raw", thread_cnt=thread_cnt) self.rawdispatcher = BugFixDispatcher(channel, self._queue) self.dispatcher = lambda q: self.rawdispatcher.dispatcher() self.channel = channel self.dir_path = self.get_save_file_dir() CsvWriteBase.__init__(self, self.dir_path, self.channel) def get_save_file_dir(self): _dir = os.path.join(os.path.dirname(__file__), '%s_%s_result' % (self.channel, self._name)) if not os.path.exists(_dir): os.mkdir(_dir) return _dir def is_need(self, doc): nowInc = doc.get('baseInfo').get('nowInc', '') if not nowInc: return False if isinstance(nowInc, str): nowInc = nowInc.decode('utf-8') if u'北大纵横' in nowInc or u'温氏集团' in nowInc or u'金诚信矿业管理' in nowInc: print doc.get('cvId'), ' ===> ', nowInc.encode('utf-8') return True if doc.get('jobList', []): for jobItem in doc.get('jobList'): incName = jobItem['incName'] if isinstance(incName, str): incName = incName.decode('utf-8') if u'北大纵横' in incName or u'温氏集团' in incName or u'金诚信矿业管理' in incName: return True return False def run_job(self, job): if self.is_need(job): self.save(job, None) print 'Copied cvId: ', job.get('cvId')