Exemple #1
0
class FixCv(BaseTask):
    def __init__(self, channel, thread_cnt=5):
        BaseTask.__init__(self, "fixbug", thread_cnt=thread_cnt)
        self.chanel = channel
        self.bugfix = BugFixDispatcher(channel, self._queue)
        self.dispatcher = lambda q: self.bugfix.dispatcher()

        self.etl_runner = ETLRunner(channel)

    def run_job(self, job):
        if not isinstance(job, dict):
            return

        jobExpSalary = job.get('jobExp',{}).get('expSalary', '').encode('utf-8')

        if not jobExpSalary:
            return

        find = re.search(r'(\d+)', jobExpSalary)
        if find:
            return
        else:

            if '面议' in jobExpSalary:
                print job.get('cvId'), " ==> ", "面议, 略过"
                return

            page_index = self.get_page_index(job.get('cvId', ''))
            print job.get('cvId'), " ==> ", jobExpSalary
            self.etl_runner.run(page_index)
            print "SUCESS COPIED %s" % job.get('cvId')

    def get_page_index(self, cvId):

        doc = self.bugfix.cv_page_store.get_one(cvId)

        pagecontent = self.bugfix.getPageContent(doc['pageContentPath'],'remote')
        if not pagecontent:
            print "cvId : %s, pagecontent null" % cvId

        job = {
            'indexUrl': doc['indexUrl'],
            'pagecontent': pagecontent,
            'updateTime':doc['updateTime'],
            'contentSign':doc['contentSign'],
        }

        return job
Exemple #2
0
class FixCvField(BaseTask):
    def __init__(self, channel, field, field_pattern):
        BaseTask.__init__(self, "fix_cv_field")
        self.channel = channel
        self.field = field
        self.field_pattern = field_pattern

        self.bugfix = BugFixDispatcher(channel, self._queue)
        self.dispatcher = lambda q: self.bugfix.dispatcher()

        self.etl_runner = ETLRunnerFromRaw(channel)


    def run_job(self, job):

        try:
             if not isinstance(job, dict):
                return

             split_field = self.field.split('.')
             if len(split_field) == 2:
                 field_1 = getattr(job, split_field[0])
                 if isinstance(field_1, list):
                     for item in field_1:
                         field_2 = getattr(item, split_field[1])
                         assert isinstance(field_2, basestring)
                         if re.search(self.field_pattern, field_2):
                             # 匹配模式,重新量化
                             print "cvId: ", job.get('cvId'), " || ",
                             print self.field_pattern, " : ", field_2, " || ",
                             self.etl_runner.run(job)

        except Exception as e:
            traceback.print_exc()
Exemple #3
0
    def __init__(self, channel, thread_cnt=5):
        """Set up the dispatcher and ETL runner for ``channel``.

        :param channel: channel identifier handed to ``BugFixDispatcher`` and
            ``ETLRunner``.
        :param thread_cnt: forwarded to ``BaseTask.__init__``.
        """
        BaseTask.__init__(self, "fixbug", thread_cnt=thread_cnt)
        self.channel = channel
        # The attribute used to be misspelled; keep the old name as an alias
        # in case other code still reads it.
        self.chanel = channel
        self.bugfix = BugFixDispatcher(channel, self._queue)
        # Accepts one argument and ignores it.
        self.dispatcher = lambda q: self.bugfix.dispatcher()

        self.etl_runner = ETLRunner(channel)
Exemple #4
0
    def __init__(self, channel, field, field_pattern):
        """Store the field spec and wire up the dispatcher and ETL runner.

        :param channel: channel identifier handed to ``BugFixDispatcher`` and
            ``ETLRunnerFromRaw``.
        :param field: field specification kept on the instance.
        :param field_pattern: pattern kept on the instance.
        """
        BaseTask.__init__(self, "fix_cv_field")

        self.channel, self.field, self.field_pattern = (
            channel, field, field_pattern)

        self.bugfix = BugFixDispatcher(channel, self._queue)

        def _dispatch(q):
            # The argument is accepted but not used.
            return self.bugfix.dispatcher()

        self.dispatcher = _dispatch
        self.etl_runner = ETLRunnerFromRaw(channel)
Exemple #5
0
    def __init__(self, channel, thread_cnt=5):
        """Initialise both bases and the raw-record dispatcher.

        :param channel: channel identifier handed to ``BugFixDispatcher`` and
            ``CsvWriteBase.__init__``.
        :param thread_cnt: forwarded to ``BaseTask.__init__``.
        """
        BaseTask.__init__(self, "gets_raw", thread_cnt=thread_cnt)

        self.rawdispatcher = BugFixDispatcher(channel, self._queue)

        def _dispatch(q):
            # The argument is accepted but not used.
            return self.rawdispatcher.dispatcher()

        self.dispatcher = _dispatch

        # The output directory is resolved only after the channel is stored.
        self.channel = channel
        self.dir_path = self.get_save_file_dir()

        CsvWriteBase.__init__(self, self.dir_path, self.channel)
Exemple #6
0
class GetsRaw(BaseTask, CsvWriteBase):
    def __init__(self, channel, thread_cnt=5):

        BaseTask.__init__(self, "gets_raw", thread_cnt=thread_cnt)

        self.rawdispatcher = BugFixDispatcher(channel, self._queue)
        self.dispatcher = lambda q: self.rawdispatcher.dispatcher()
        self.channel = channel

        self.dir_path = self.get_save_file_dir()

        CsvWriteBase.__init__(self, self.dir_path, self.channel)

    def get_save_file_dir(self):
        _dir = os.path.join(os.path.dirname(__file__), '%s_%s_result' % (self.channel, self._name))
        if not os.path.exists(_dir):
            os.mkdir(_dir)
        return _dir

    def is_need(self, doc):
        nowInc = doc.get('baseInfo').get('nowInc', '')
        if not nowInc:
            return False

        if isinstance(nowInc, str):
            nowInc = nowInc.decode('utf-8')
        if u'北大纵横' in nowInc or u'温氏集团' in nowInc or u'金诚信矿业管理' in nowInc:
            print doc.get('cvId'), ' ===> ', nowInc.encode('utf-8')
            return True
        if doc.get('jobList', []):
            for jobItem in doc.get('jobList'):
                incName = jobItem['incName']
                if isinstance(incName, str):
                    incName = incName.decode('utf-8')

                if u'北大纵横' in incName or u'温氏集团' in incName or u'金诚信矿业管理' in incName:
                    return True

        return False

    def run_job(self, job):
        if self.is_need(job):
            self.save(job, None)
            print 'Copied cvId: ', job.get('cvId')